diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-15 10:46:16 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-15 10:46:16 -0700 |
commit | 93b694d096cc10994c817730d4d50288f9ae3d66 (patch) | |
tree | 5bd967686d0003f7dbbe1da49f5399cb4a92f074 /drivers/gpu/drm/i915/gt | |
parent | 726eb70e0d34dc4bc4dada71f52bba8ed638431e (diff) | |
parent | 640eee067d9aae0bb98d8706001976ff1affaf00 (diff) |
Merge tag 'drm-next-2020-10-15' of git://anongit.freedesktop.org/drm/drm
Pull drm updates from Dave Airlie:
"Not a major amount of change, the i915 trees got split into display
and gt trees to better facilitate higher level review, and there's a
major refactoring of i915 GEM locking to use more core kernel concepts
(like ww-mutexes). msm gets per-process pagetables, older AMD SI cards
get DC support, nouveau got a bump in displayport support with common
code extraction from i915.
Outside of drm this contains a couple of patches for hexint
moduleparams which you've acked, and a virtio common code tree that
you should also get via it's regular path.
New driver:
- Cadence MHDP8546 DisplayPort bridge driver
core:
- cross-driver scatterlist cleanups
- devm_drm conversions
- remove drm_dev_init
- devm_drm_dev_alloc conversion
ttm:
- lots of refactoring and cleanups
bridges:
- chained bridge support in more drivers
panel:
- misc new panels
scheduler:
- cleanup priority levels
displayport:
- refactor i915 code into helpers for nouveau
i915:
- split into display and GT trees
- WW locking refactoring in GEM
- execbuf2 extension mechanism
- syncobj timeline support
- GEN 12 HOBL display powersaving
- Rocket Lake display additions
- Disable FBC on Tigerlake
- Tigerlake Type-C + DP improvements
- Hotplug interrupt refactoring
amdgpu:
- Sienna Cichlid updates
- Navy Flounder updates
- DCE6 (SI) support for DC
- Plane rotation enabled
- TMZ state info ioctl
- PCIe DPC recovery support
- DC interrupt handling refactor
- OLED panel fixes
amdkfd:
- add SMI events for thermal throttling
- SMI interface events ioctl update
- process eviction counters
radeon:
- move to dma_ for allocations
- expose sclk via sysfs
msm:
- DSI support for sm8150/sm8250
- per-process GPU pagetable support
- Displayport support
mediatek:
- move HDMI phy driver to PHY
- convert mtk-dpi to bridge API
- disable mt2701 tmds
tegra:
- bridge support
exynos:
- misc cleanups
vc4:
- dual display cleanups
ast:
- cleanups
gma500:
- conversion to GPIOd API
hisilicon:
- misc reworks
ingenic:
- clock handling and format improvements
mcde:
- DSI support
mgag200:
- desktop g200 support
mxsfb:
- i.MX7 + i.MX8M
- alpha plane support
panfrost:
- devfreq support
- amlogic SoC support
ps8640:
- EDID from eDP retrieval
tidss:
- AM65xx YUV workaround
virtio:
- virtio-gpu exported resources
rcar-du:
- R8A7742, R8A774E1 and R8A77961 support
- YUV planar format fixes
- non-visible plane handling
- VSP device reference count fix
- Kconfig fix to avoid displaying disabled options in .config"
* tag 'drm-next-2020-10-15' of git://anongit.freedesktop.org/drm/drm: (1494 commits)
drm/ingenic: Fix bad revert
drm/amdgpu: Fix invalid number of character '{' in amdgpu_acpi_init
drm/amdgpu: Remove warning for virtual_display
drm/amdgpu: kfd_initialized can be static
drm/amd/pm: setup APU dpm clock table in SMU HW initialization
drm/amdgpu: prevent spurious warning
drm/amdgpu/swsmu: fix ARC build errors
drm/amd/display: Fix OPTC_DATA_FORMAT programming
drm/amd/display: Don't allow pstate if no support in blank
drm/panfrost: increase readl_relaxed_poll_timeout values
MAINTAINERS: Update entry for st7703 driver after the rename
Revert "gpu/drm: ingenic: Add option to mmap GEM buffers cached"
drm/amd/display: HDMI remote sink need mode validation for Linux
drm/amd/display: Change to correct unit on audio rate
drm/amd/display: Avoid set zero in the requested clk
drm/amdgpu: align frag_end to covered address space
drm/amdgpu: fix NULL pointer dereference for Renoir
drm/vmwgfx: fix regression in thp code due to ttm init refactor.
drm/amdgpu/swsmu: add interrupt work handler for smu11 parts
drm/amdgpu/swsmu: add interrupt work function
...
Diffstat (limited to 'drivers/gpu/drm/i915/gt')
42 files changed, 1568 insertions, 1156 deletions
diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c index cdc0b9c54305..fd0d24d28763 100644 --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c @@ -16,8 +16,10 @@ static inline void gen6_write_pde(const struct gen6_ppgtt *ppgtt, const unsigned int pde, const struct i915_page_table *pt) { + dma_addr_t addr = pt ? px_dma(pt) : px_dma(ppgtt->base.vm.scratch[1]); + /* Caller needs to make sure the write completes if necessary */ - iowrite32(GEN6_PDE_ADDR_ENCODE(px_dma(pt)) | GEN6_PDE_VALID, + iowrite32(GEN6_PDE_ADDR_ENCODE(addr) | GEN6_PDE_VALID, ppgtt->pd_addr + pde); } @@ -79,7 +81,7 @@ static void gen6_ppgtt_clear_range(struct i915_address_space *vm, { struct gen6_ppgtt * const ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm)); const unsigned int first_entry = start / I915_GTT_PAGE_SIZE; - const gen6_pte_t scratch_pte = vm->scratch[0].encode; + const gen6_pte_t scratch_pte = vm->scratch[0]->encode; unsigned int pde = first_entry / GEN6_PTES; unsigned int pte = first_entry % GEN6_PTES; unsigned int num_entries = length / I915_GTT_PAGE_SIZE; @@ -90,8 +92,6 @@ static void gen6_ppgtt_clear_range(struct i915_address_space *vm, const unsigned int count = min(num_entries, GEN6_PTES - pte); gen6_pte_t *vaddr; - GEM_BUG_ON(px_base(pt) == px_base(&vm->scratch[1])); - num_entries -= count; GEM_BUG_ON(count > atomic_read(&pt->used)); @@ -127,7 +127,7 @@ static void gen6_ppgtt_insert_entries(struct i915_address_space *vm, struct sgt_dma iter = sgt_dma(vma); gen6_pte_t *vaddr; - GEM_BUG_ON(pd->entry[act_pt] == &vm->scratch[1]); + GEM_BUG_ON(!pd->entry[act_pt]); vaddr = kmap_atomic_px(i915_pt_entry(pd, act_pt)); do { @@ -177,39 +177,36 @@ static void gen6_flush_pd(struct gen6_ppgtt *ppgtt, u64 start, u64 end) mutex_unlock(&ppgtt->flush); } -static int gen6_alloc_va_range(struct i915_address_space *vm, - u64 start, u64 length) +static void gen6_alloc_va_range(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 start, u64 length) { struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm)); struct i915_page_directory * const pd = ppgtt->base.pd; - struct i915_page_table *pt, *alloc = NULL; + struct i915_page_table *pt; bool flush = false; u64 from = start; unsigned int pde; - int ret = 0; spin_lock(&pd->lock); gen6_for_each_pde(pt, pd, start, length, pde) { const unsigned int count = gen6_pte_count(start, length); - if (px_base(pt) == px_base(&vm->scratch[1])) { + if (!pt) { spin_unlock(&pd->lock); - pt = fetch_and_zero(&alloc); - if (!pt) - pt = alloc_pt(vm); - if (IS_ERR(pt)) { - ret = PTR_ERR(pt); - goto unwind_out; - } + pt = stash->pt[0]; + __i915_gem_object_pin_pages(pt->base); + i915_gem_object_make_unshrinkable(pt->base); - fill32_px(pt, vm->scratch[0].encode); + fill32_px(pt, vm->scratch[0]->encode); spin_lock(&pd->lock); - if (pd->entry[pde] == &vm->scratch[1]) { + if (!pd->entry[pde]) { + stash->pt[0] = pt->stash; + atomic_set(&pt->used, 0); pd->entry[pde] = pt; } else { - alloc = pt; pt = pd->entry[pde]; } @@ -226,38 +223,32 @@ static int gen6_alloc_va_range(struct i915_address_space *vm, with_intel_runtime_pm(&vm->i915->runtime_pm, wakeref) gen6_flush_pd(ppgtt, from, start); } - - goto out; - -unwind_out: - gen6_ppgtt_clear_range(vm, from, start - from); -out: - if (alloc) - free_px(vm, alloc); - return ret; } static int gen6_ppgtt_init_scratch(struct gen6_ppgtt *ppgtt) { struct i915_address_space * const vm = &ppgtt->base.vm; - struct i915_page_directory * const pd = ppgtt->base.pd; int ret; - ret = setup_scratch_page(vm, __GFP_HIGHMEM); + ret = setup_scratch_page(vm); if (ret) return ret; - vm->scratch[0].encode = - vm->pte_encode(px_dma(&vm->scratch[0]), + vm->scratch[0]->encode = + vm->pte_encode(px_dma(vm->scratch[0]), I915_CACHE_NONE, PTE_READ_ONLY); - if (unlikely(setup_page_dma(vm, px_base(&vm->scratch[1])))) { - cleanup_scratch_page(vm); - return -ENOMEM; + vm->scratch[1] = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(vm->scratch[1])) + return PTR_ERR(vm->scratch[1]); + + ret = pin_pt_dma(vm, vm->scratch[1]); + if (ret) { + i915_gem_object_put(vm->scratch[1]); + return ret; } - fill32_px(&vm->scratch[1], vm->scratch[0].encode); - memset_p(pd->entry, &vm->scratch[1], I915_PDES); + fill32_px(vm->scratch[1], vm->scratch[0]->encode); return 0; } @@ -265,14 +256,12 @@ static int gen6_ppgtt_init_scratch(struct gen6_ppgtt *ppgtt) static void gen6_ppgtt_free_pd(struct gen6_ppgtt *ppgtt) { struct i915_page_directory * const pd = ppgtt->base.pd; - struct i915_page_dma * const scratch = - px_base(&ppgtt->base.vm.scratch[1]); struct i915_page_table *pt; u32 pde; gen6_for_all_pdes(pt, pd, pde) - if (px_base(pt) != scratch) - free_px(&ppgtt->base.vm, pt); + if (pt) + free_pt(&ppgtt->base.vm, pt); } static void gen6_ppgtt_cleanup(struct i915_address_space *vm) @@ -286,7 +275,8 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm) mutex_destroy(&ppgtt->flush); mutex_destroy(&ppgtt->pin_mutex); - kfree(ppgtt->base.pd); + + free_pd(&ppgtt->base.vm, ppgtt->base.pd); } static int pd_vma_set_pages(struct i915_vma *vma) @@ -302,28 +292,26 @@ static void pd_vma_clear_pages(struct i915_vma *vma) vma->pages = NULL; } -static int pd_vma_bind(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 unused) +static void pd_vma_bind(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 unused) { struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); struct gen6_ppgtt *ppgtt = vma->private; u32 ggtt_offset = i915_ggtt_offset(vma) / I915_GTT_PAGE_SIZE; - px_base(ppgtt->base.pd)->ggtt_offset = ggtt_offset * sizeof(gen6_pte_t); + ppgtt->pp_dir = ggtt_offset * sizeof(gen6_pte_t) << 10; ppgtt->pd_addr = (gen6_pte_t __iomem *)ggtt->gsm + ggtt_offset; gen6_flush_pd(ppgtt, 0, ppgtt->base.vm.total); - return 0; } static void pd_vma_unbind(struct i915_address_space *vm, struct i915_vma *vma) { struct gen6_ppgtt *ppgtt = vma->private; struct i915_page_directory * const pd = ppgtt->base.pd; - struct i915_page_dma * const scratch = - px_base(&ppgtt->base.vm.scratch[1]); struct i915_page_table *pt; unsigned int pde; @@ -332,11 +320,11 @@ static void pd_vma_unbind(struct i915_address_space *vm, struct i915_vma *vma) /* Free all no longer used page tables */ gen6_for_all_pdes(pt, ppgtt->base.pd, pde) { - if (px_base(pt) == scratch || atomic_read(&pt->used)) + if (!pt || atomic_read(&pt->used)) continue; - free_px(&ppgtt->base.vm, pt); - pd->entry[pde] = scratch; + free_pt(&ppgtt->base.vm, pt); + pd->entry[pde] = NULL; } ppgtt->scan_for_unused_pt = false; @@ -380,7 +368,7 @@ static struct i915_vma *pd_vma_create(struct gen6_ppgtt *ppgtt, int size) return vma; } -int gen6_ppgtt_pin(struct i915_ppgtt *base) +int gen6_ppgtt_pin(struct i915_ppgtt *base, struct i915_gem_ww_ctx *ww) { struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(base); int err; @@ -406,7 +394,7 @@ int gen6_ppgtt_pin(struct i915_ppgtt *base) */ err = 0; if (!atomic_read(&ppgtt->pin_count)) - err = i915_ggtt_pin(ppgtt->vma, GEN6_PD_ALIGN, PIN_HIGH); + err = i915_ggtt_pin(ppgtt->vma, ww, GEN6_PD_ALIGN, PIN_HIGH); if (!err) atomic_inc(&ppgtt->pin_count); mutex_unlock(&ppgtt->pin_mutex); @@ -448,6 +436,7 @@ struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt) mutex_init(&ppgtt->pin_mutex); ppgtt_init(&ppgtt->base, gt); + ppgtt->base.vm.pd_shift = ilog2(SZ_4K * SZ_4K / sizeof(gen6_pte_t)); ppgtt->base.vm.top = 1; ppgtt->base.vm.bind_async_flags = I915_VMA_LOCAL_BIND; @@ -456,9 +445,10 @@ struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt) ppgtt->base.vm.insert_entries = gen6_ppgtt_insert_entries; ppgtt->base.vm.cleanup = gen6_ppgtt_cleanup; + ppgtt->base.vm.alloc_pt_dma = alloc_pt_dma; ppgtt->base.vm.pte_encode = ggtt->vm.pte_encode; - ppgtt->base.pd = __alloc_pd(sizeof(*ppgtt->base.pd)); + ppgtt->base.pd = __alloc_pd(I915_PDES); if (!ppgtt->base.pd) { err = -ENOMEM; goto err_free; @@ -479,7 +469,7 @@ struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt) err_scratch: free_scratch(&ppgtt->base.vm); err_pd: - kfree(ppgtt->base.pd); + free_pd(&ppgtt->base.vm, ppgtt->base.pd); err_free: mutex_destroy(&ppgtt->pin_mutex); kfree(ppgtt); diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.h b/drivers/gpu/drm/i915/gt/gen6_ppgtt.h index 72e481806c96..3357228f3304 100644 --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.h +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.h @@ -8,12 +8,15 @@ #include "intel_gtt.h" +struct i915_gem_ww_ctx; + struct gen6_ppgtt { struct i915_ppgtt base; struct mutex flush; struct i915_vma *vma; gen6_pte_t __iomem *pd_addr; + u32 pp_dir; atomic_t pin_count; struct mutex pin_mutex; @@ -66,7 +69,7 @@ static inline struct gen6_ppgtt *to_gen6_ppgtt(struct i915_ppgtt *base) (pt = i915_pt_entry(pd, iter), true); \ ++iter) -int gen6_ppgtt_pin(struct i915_ppgtt *base); +int gen6_ppgtt_pin(struct i915_ppgtt *base, struct i915_gem_ww_ctx *ww); void gen6_ppgtt_unpin(struct i915_ppgtt *base); void gen6_ppgtt_unpin_all(struct i915_ppgtt *base); void gen6_ppgtt_enable(struct intel_gt *gt); diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index 699125928272..eb64f474a78c 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -181,7 +181,7 @@ static void __gen8_ppgtt_cleanup(struct i915_address_space *vm, } while (pde++, --count); } - free_px(vm, pd); + free_px(vm, &pd->pt, lvl); } static void gen8_ppgtt_cleanup(struct i915_address_space *vm) @@ -199,7 +199,7 @@ static u64 __gen8_ppgtt_clear(struct i915_address_space * const vm, struct i915_page_directory * const pd, u64 start, const u64 end, int lvl) { - const struct i915_page_scratch * const scratch = &vm->scratch[lvl]; + const struct drm_i915_gem_object * const scratch = vm->scratch[lvl]; unsigned int idx, len; GEM_BUG_ON(end > vm->total >> GEN8_PTE_SHIFT); @@ -239,7 +239,7 @@ static u64 __gen8_ppgtt_clear(struct i915_address_space * const vm, vaddr = kmap_atomic_px(pt); memset64(vaddr + gen8_pd_index(start, 0), - vm->scratch[0].encode, + vm->scratch[0]->encode, count); kunmap_atomic(vaddr); @@ -248,7 +248,7 @@ static u64 __gen8_ppgtt_clear(struct i915_address_space * const vm, } if (release_pd_entry(pd, idx, pt, scratch)) - free_px(vm, pt); + free_px(vm, pt, lvl); } while (idx++, --len); return start; @@ -269,14 +269,12 @@ static void gen8_ppgtt_clear(struct i915_address_space *vm, start, start + length, vm->top); } -static int __gen8_ppgtt_alloc(struct i915_address_space * const vm, - struct i915_page_directory * const pd, - u64 * const start, const u64 end, int lvl) +static void __gen8_ppgtt_alloc(struct i915_address_space * const vm, + struct i915_vm_pt_stash *stash, + struct i915_page_directory * const pd, + u64 * const start, const u64 end, int lvl) { - const struct i915_page_scratch * const scratch = &vm->scratch[lvl]; - struct i915_page_table *alloc = NULL; unsigned int idx, len; - int ret = 0; GEM_BUG_ON(end > vm->total >> GEN8_PTE_SHIFT); @@ -297,49 +295,31 @@ static int __gen8_ppgtt_alloc(struct i915_address_space * const vm, DBG("%s(%p):{ lvl:%d, idx:%d } allocating new tree\n", __func__, vm, lvl + 1, idx); - pt = fetch_and_zero(&alloc); - if (lvl) { - if (!pt) { - pt = &alloc_pd(vm)->pt; - if (IS_ERR(pt)) { - ret = PTR_ERR(pt); - goto out; - } - } - - fill_px(pt, vm->scratch[lvl].encode); - } else { - if (!pt) { - pt = alloc_pt(vm); - if (IS_ERR(pt)) { - ret = PTR_ERR(pt); - goto out; - } - } - - if (intel_vgpu_active(vm->i915) || - gen8_pt_count(*start, end) < I915_PDES) - fill_px(pt, vm->scratch[lvl].encode); - } + pt = stash->pt[!!lvl]; + __i915_gem_object_pin_pages(pt->base); + i915_gem_object_make_unshrinkable(pt->base); + + if (lvl || + gen8_pt_count(*start, end) < I915_PDES || + intel_vgpu_active(vm->i915)) + fill_px(pt, vm->scratch[lvl]->encode); spin_lock(&pd->lock); - if (likely(!pd->entry[idx])) + if (likely(!pd->entry[idx])) { + stash->pt[!!lvl] = pt->stash; + atomic_set(&pt->used, 0); set_pd_entry(pd, idx, pt); - else - alloc = pt, pt = pd->entry[idx]; + } else { + pt = pd->entry[idx]; + } } if (lvl) { atomic_inc(&pt->used); spin_unlock(&pd->lock); - ret = __gen8_ppgtt_alloc(vm, as_pd(pt), - start, end, lvl); - if (unlikely(ret)) { - if (release_pd_entry(pd, idx, pt, scratch)) - free_px(vm, pt); - goto out; - } + __gen8_ppgtt_alloc(vm, stash, + as_pd(pt), start, end, lvl); spin_lock(&pd->lock); atomic_dec(&pt->used); @@ -359,18 +339,12 @@ static int __gen8_ppgtt_alloc(struct i915_address_space * const vm, } } while (idx++, --len); spin_unlock(&pd->lock); -out: - if (alloc) - free_px(vm, alloc); - return ret; } -static int gen8_ppgtt_alloc(struct i915_address_space *vm, - u64 start, u64 length) +static void gen8_ppgtt_alloc(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 start, u64 length) { - u64 from; - int err; - GEM_BUG_ON(!IS_ALIGNED(start, BIT_ULL(GEN8_PTE_SHIFT))); GEM_BUG_ON(!IS_ALIGNED(length, BIT_ULL(GEN8_PTE_SHIFT))); GEM_BUG_ON(range_overflows(start, length, vm->total)); @@ -378,25 +352,9 @@ static int gen8_ppgtt_alloc(struct i915_address_space *vm, start >>= GEN8_PTE_SHIFT; length >>= GEN8_PTE_SHIFT; GEM_BUG_ON(length == 0); - from = start; - err = __gen8_ppgtt_alloc(vm, i915_vm_to_ppgtt(vm)->pd, - &start, start + length, vm->top); - if (unlikely(err && from != start)) - __gen8_ppgtt_clear(vm, i915_vm_to_ppgtt(vm)->pd, - from, start, vm->top); - - return err; -} - -static __always_inline void -write_pte(gen8_pte_t *pte, const gen8_pte_t val) -{ - /* Magic delays? Or can we refine these to flush all in one pass? */ - *pte = val; - wmb(); /* cpu to cache */ - clflush(pte); /* cache to memory */ - wmb(); /* visible to all */ + __gen8_ppgtt_alloc(vm, stash, i915_vm_to_ppgtt(vm)->pd, + &start, start + length, vm->top); } static __always_inline u64 @@ -415,8 +373,7 @@ gen8_ppgtt_insert_pte(struct i915_ppgtt *ppgtt, vaddr = kmap_atomic_px(i915_pt_entry(pd, gen8_pd_index(idx, 1))); do { GEM_BUG_ON(iter->sg->length < I915_GTT_PAGE_SIZE); - write_pte(&vaddr[gen8_pd_index(idx, 0)], - pte_encode | iter->dma); + vaddr[gen8_pd_index(idx, 0)] = pte_encode | iter->dma; iter->dma += I915_GTT_PAGE_SIZE; if (iter->dma >= iter->max) { @@ -439,10 +396,12 @@ gen8_ppgtt_insert_pte(struct i915_ppgtt *ppgtt, pd = pdp->entry[gen8_pd_index(idx, 2)]; } + clflush_cache_range(vaddr, PAGE_SIZE); kunmap_atomic(vaddr); vaddr = kmap_atomic_px(i915_pt_entry(pd, gen8_pd_index(idx, 1))); } } while (1); + clflush_cache_range(vaddr, PAGE_SIZE); kunmap_atomic(vaddr); return idx; @@ -498,7 +457,7 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, do { GEM_BUG_ON(iter->sg->length < page_size); - write_pte(&vaddr[index++], encode | iter->dma); + vaddr[index++] = encode | iter->dma; start += page_size; iter->dma += page_size; @@ -523,6 +482,7 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, } } while (rem >= page_size && index < I915_PDES); + clflush_cache_range(vaddr, PAGE_SIZE); kunmap_atomic(vaddr); /* @@ -554,7 +514,7 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, if (I915_SELFTEST_ONLY(vma->vm->scrub_64K)) { u16 i; - encode = vma->vm->scratch[0].encode; + encode = vma->vm->scratch[0]->encode; vaddr = kmap_atomic_px(i915_pt_entry(pd, maybe_64K)); for (i = 1; i < index; i += 16) @@ -608,27 +568,37 @@ static int gen8_init_scratch(struct i915_address_space *vm) GEM_BUG_ON(!clone->has_read_only); vm->scratch_order = clone->scratch_order; - memcpy(vm->scratch, clone->scratch, sizeof(vm->scratch)); - px_dma(&vm->scratch[0]) = 0; /* no xfer of ownership */ + for (i = 0; i <= vm->top; i++) + vm->scratch[i] = i915_gem_object_get(clone->scratch[i]); + return 0; } - ret = setup_scratch_page(vm, __GFP_HIGHMEM); + ret = setup_scratch_page(vm); if (ret) return ret; - vm->scratch[0].encode = - gen8_pte_encode(px_dma(&vm->scratch[0]), + vm->scratch[0]->encode = + gen8_pte_encode(px_dma(vm->scratch[0]), I915_CACHE_LLC, vm->has_read_only); for (i = 1; i <= vm->top; i++) { - if (unlikely(setup_page_dma(vm, px_base(&vm->scratch[i])))) + struct drm_i915_gem_object *obj; + + obj = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(obj)) + goto free_scratch; + + ret = pin_pt_dma(vm, obj); + if (ret) { + i915_gem_object_put(obj); goto free_scratch; + } - fill_px(&vm->scratch[i], vm->scratch[i - 1].encode); - vm->scratch[i].encode = - gen8_pde_encode(px_dma(&vm->scratch[i]), - I915_CACHE_LLC); + fill_px(obj, vm->scratch[i - 1]->encode); + obj->encode = gen8_pde_encode(px_dma(obj), I915_CACHE_LLC); + + vm->scratch[i] = obj; } return 0; @@ -649,12 +619,20 @@ static int gen8_preallocate_top_level_pdp(struct i915_ppgtt *ppgtt) for (idx = 0; idx < GEN8_3LVL_PDPES; idx++) { struct i915_page_directory *pde; + int err; pde = alloc_pd(vm); if (IS_ERR(pde)) return PTR_ERR(pde); - fill_px(pde, vm->scratch[1].encode); + err = pin_pt_dma(vm, pde->pt.base); + if (err) { + i915_gem_object_put(pde->pt.base); + free_pd(vm, pde); + return err; + } + + fill_px(pde, vm->scratch[1]->encode); set_pd_entry(pd, idx, pde); atomic_inc(px_used(pde)); /* keep pinned */ } @@ -668,21 +646,32 @@ gen8_alloc_top_pd(struct i915_address_space *vm) { const unsigned int count = gen8_pd_top_count(vm); struct i915_page_directory *pd; + int err; - GEM_BUG_ON(count > ARRAY_SIZE(pd->entry)); + GEM_BUG_ON(count > I915_PDES); - pd = __alloc_pd(offsetof(typeof(*pd), entry[count])); + pd = __alloc_pd(count); if (unlikely(!pd)) return ERR_PTR(-ENOMEM); - if (unlikely(setup_page_dma(vm, px_base(pd)))) { - kfree(pd); - return ERR_PTR(-ENOMEM); + pd->pt.base = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(pd->pt.base)) { + err = PTR_ERR(pd->pt.base); + pd->pt.base = NULL; + goto err_pd; } - fill_page_dma(px_base(pd), vm->scratch[vm->top].encode, count); + err = pin_pt_dma(vm, pd->pt.base); + if (err) + goto err_pd; + + fill_page_dma(px_base(pd), vm->scratch[vm->top]->encode, count); atomic_inc(px_used(pd)); /* mark as pinned */ return pd; + +err_pd: + free_pd(vm, pd); + return ERR_PTR(err); } /* @@ -703,6 +692,7 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt) ppgtt_init(ppgtt, gt); ppgtt->vm.top = i915_vm_is_4lvl(&ppgtt->vm) ? 3 : 2; + ppgtt->vm.pd_shift = ilog2(SZ_4K * SZ_4K / sizeof(gen8_pte_t)); /* * From bdw, there is hw support for read-only pages in the PPGTT. @@ -714,12 +704,7 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt) */ ppgtt->vm.has_read_only = !IS_GEN_RANGE(gt->i915, 11, 12); - /* - * There are only few exceptions for gen >=6. chv and bxt. - * And we are not sure about the latter so play safe for now. - */ - if (IS_CHERRYVIEW(gt->i915) || IS_BROXTON(gt->i915)) - ppgtt->vm.pt_kmap_wc = true; + ppgtt->vm.alloc_pt_dma = alloc_pt_dma; err = gen8_init_scratch(&ppgtt->vm); if (err) diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index 91786310c114..d8b206e53660 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -28,6 +28,8 @@ #include "i915_drv.h" #include "i915_trace.h" +#include "intel_breadcrumbs.h" +#include "intel_context.h" #include "intel_gt_pm.h" #include "intel_gt_requests.h" @@ -53,33 +55,65 @@ static void irq_disable(struct intel_engine_cs *engine) spin_unlock(&engine->gt->irq_lock); } -static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b) +static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) { - struct intel_engine_cs *engine = - container_of(b, struct intel_engine_cs, breadcrumbs); + lockdep_assert_held(&b->irq_lock); + + if (!b->irq_engine || b->irq_armed) + return; + + if (!intel_gt_pm_get_if_awake(b->irq_engine->gt)) + return; + + /* + * The breadcrumb irq will be disarmed on the interrupt after the + * waiters are signaled. This gives us a single interrupt window in + * which we can add a new waiter and avoid the cost of re-enabling + * the irq. + */ + WRITE_ONCE(b->irq_armed, true); + + /* + * Since we are waiting on a request, the GPU should be busy + * and should have its own rpm reference. This is tracked + * by i915->gt.awake, we can forgo holding our own wakref + * for the interrupt as before i915->gt.awake is released (when + * the driver is idle) we disarm the breadcrumbs. + */ + if (!b->irq_enabled++) + irq_enable(b->irq_engine); +} + +static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b) +{ lockdep_assert_held(&b->irq_lock); + if (!b->irq_engine || !b->irq_armed) + return; + GEM_BUG_ON(!b->irq_enabled); if (!--b->irq_enabled) - irq_disable(engine); + irq_disable(b->irq_engine); WRITE_ONCE(b->irq_armed, false); - intel_gt_pm_put_async(engine->gt); + intel_gt_pm_put_async(b->irq_engine->gt); } -void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine) +static void add_signaling_context(struct intel_breadcrumbs *b, + struct intel_context *ce) { - struct intel_breadcrumbs *b = &engine->breadcrumbs; - unsigned long flags; - - if (!READ_ONCE(b->irq_armed)) - return; + intel_context_get(ce); + list_add_tail(&ce->signal_link, &b->signalers); + if (list_is_first(&ce->signal_link, &b->signalers)) + __intel_breadcrumbs_arm_irq(b); +} - spin_lock_irqsave(&b->irq_lock, flags); - if (b->irq_armed) - __intel_breadcrumbs_disarm_irq(b); - spin_unlock_irqrestore(&b->irq_lock, flags); +static void remove_signaling_context(struct intel_breadcrumbs *b, + struct intel_context *ce) +{ + list_del(&ce->signal_link); + intel_context_put(ce); } static inline bool __request_completed(const struct i915_request *rq) @@ -90,6 +124,9 @@ static inline bool __request_completed(const struct i915_request *rq) __maybe_unused static bool check_signal_order(struct intel_context *ce, struct i915_request *rq) { + if (rq->context != ce) + return false; + if (!list_is_last(&rq->signal_link, &ce->signals) && i915_seqno_passed(rq->fence.seqno, list_next_entry(rq, signal_link)->fence.seqno)) @@ -133,25 +170,21 @@ __dma_fence_signal__notify(struct dma_fence *fence, static void add_retire(struct intel_breadcrumbs *b, struct intel_timeline *tl) { - struct intel_engine_cs *engine = - container_of(b, struct intel_engine_cs, breadcrumbs); - - if (unlikely(intel_engine_is_virtual(engine))) - engine = intel_virtual_engine_get_sibling(engine, 0); - - intel_engine_add_retire(engine, tl); + if (b->irq_engine) + intel_engine_add_retire(b->irq_engine, tl); } -static void __signal_request(struct i915_request *rq, struct list_head *signals) +static bool __signal_request(struct i915_request *rq, struct list_head *signals) { - GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)); clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); - if (!__dma_fence_signal(&rq->fence)) - return; + if (!__dma_fence_signal(&rq->fence)) { + i915_request_put(rq); + return false; + } - i915_request_get(rq); list_add_tail(&rq->signal_link, signals); + return true; } static void signal_irq_work(struct irq_work *work) @@ -164,7 +197,7 @@ static void signal_irq_work(struct irq_work *work) spin_lock(&b->irq_lock); - if (b->irq_armed && list_empty(&b->signalers)) + if (list_empty(&b->signalers)) __intel_breadcrumbs_disarm_irq(b); list_splice_init(&b->signaled_requests, &signal); @@ -197,8 +230,8 @@ static void signal_irq_work(struct irq_work *work) /* Advance the list to the first incomplete request */ __list_del_many(&ce->signals, pos); if (&ce->signals == pos) { /* now empty */ - list_del_init(&ce->signal_link); add_retire(b, ce->timeline); + remove_signaling_context(b, ce); } } } @@ -220,116 +253,89 @@ static void signal_irq_work(struct irq_work *work) } } -static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) +struct intel_breadcrumbs * +intel_breadcrumbs_create(struct intel_engine_cs *irq_engine) { - struct intel_engine_cs *engine = - container_of(b, struct intel_engine_cs, breadcrumbs); - - lockdep_assert_held(&b->irq_lock); - if (b->irq_armed) - return true; - - if (!intel_gt_pm_get_if_awake(engine->gt)) - return false; - - /* - * The breadcrumb irq will be disarmed on the interrupt after the - * waiters are signaled. This gives us a single interrupt window in - * which we can add a new waiter and avoid the cost of re-enabling - * the irq. - */ - WRITE_ONCE(b->irq_armed, true); - - /* - * Since we are waiting on a request, the GPU should be busy - * and should have its own rpm reference. This is tracked - * by i915->gt.awake, we can forgo holding our own wakref - * for the interrupt as before i915->gt.awake is released (when - * the driver is idle) we disarm the breadcrumbs. - */ - - if (!b->irq_enabled++) - irq_enable(engine); + struct intel_breadcrumbs *b; - return true; -} - -void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) -{ - struct intel_breadcrumbs *b = &engine->breadcrumbs; + b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return NULL; spin_lock_init(&b->irq_lock); INIT_LIST_HEAD(&b->signalers); INIT_LIST_HEAD(&b->signaled_requests); init_irq_work(&b->irq_work, signal_irq_work); + + b->irq_engine = irq_engine; + + return b; } -void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) +void intel_breadcrumbs_reset(struct intel_breadcrumbs *b) { - struct intel_breadcrumbs *b = &engine->breadcrumbs; unsigned long flags; + if (!b->irq_engine) + return; + spin_lock_irqsave(&b->irq_lock, flags); if (b->irq_enabled) - irq_enable(engine); + irq_enable(b->irq_engine); else - irq_disable(engine); + irq_disable(b->irq_engine); spin_unlock_irqrestore(&b->irq_lock, flags); } -void intel_engine_transfer_stale_breadcrumbs(struct intel_engine_cs *engine, - struct intel_context *ce) +void intel_breadcrumbs_park(struct intel_breadcrumbs *b) { - struct intel_breadcrumbs *b = &engine->breadcrumbs; unsigned long flags; - spin_lock_irqsave(&b->irq_lock, flags); - if (!list_empty(&ce->signals)) { - struct i915_request *rq, *next; - - /* Queue for executing the signal callbacks in the irq_work */ - list_for_each_entry_safe(rq, next, &ce->signals, signal_link) { - GEM_BUG_ON(rq->engine != engine); - GEM_BUG_ON(!__request_completed(rq)); - - __signal_request(rq, &b->signaled_requests); - } + if (!READ_ONCE(b->irq_armed)) + return; - INIT_LIST_HEAD(&ce->signals); - list_del_init(&ce->signal_link); + spin_lock_irqsave(&b->irq_lock, flags); + __intel_breadcrumbs_disarm_irq(b); + spin_unlock_irqrestore(&b->irq_lock, flags); + if (!list_empty(&b->signalers)) irq_work_queue(&b->irq_work); - } - spin_unlock_irqrestore(&b->irq_lock, flags); } -void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine) +void intel_breadcrumbs_free(struct intel_breadcrumbs *b) { + kfree(b); } -bool i915_request_enable_breadcrumb(struct i915_request *rq) +static void insert_breadcrumb(struct i915_request *rq, + struct intel_breadcrumbs *b) { - lockdep_assert_held(&rq->lock); - - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)) - return true; + struct intel_context *ce = rq->context; + struct list_head *pos; - if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) { - struct intel_breadcrumbs *b = &rq->engine->breadcrumbs; - struct intel_context *ce = rq->context; - struct list_head *pos; - - spin_lock(&b->irq_lock); + if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)) + return; - if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)) - goto unlock; + i915_request_get(rq); - if (!__intel_breadcrumbs_arm_irq(b)) - goto unlock; + /* + * If the request is already completed, we can transfer it + * straight onto a signaled list, and queue the irq worker for + * its signal completion. + */ + if (__request_completed(rq)) { + if (__signal_request(rq, &b->signaled_requests)) + irq_work_queue(&b->irq_work); + return; + } + if (list_empty(&ce->signals)) { + add_signaling_context(b, ce); + pos = &ce->signals; + } else { /* * We keep the seqno in retirement order, so we can break * inside intel_engine_signal_breadcrumbs as soon as we've @@ -351,24 +357,75 @@ bool i915_request_enable_breadcrumb(struct i915_request *rq) if (i915_seqno_passed(rq->fence.seqno, it->fence.seqno)) break; } - list_add(&rq->signal_link, pos); - if (pos == &ce->signals) /* catch transitions from empty list */ - list_move_tail(&ce->signal_link, &b->signalers); - GEM_BUG_ON(!check_signal_order(ce, rq)); + } + list_add(&rq->signal_link, pos); + GEM_BUG_ON(!check_signal_order(ce, rq)); + set_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); + + /* Check after attaching to irq, interrupt may have already fired. */ + if (__request_completed(rq)) + irq_work_queue(&b->irq_work); +} - set_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); -unlock: +bool i915_request_enable_breadcrumb(struct i915_request *rq) +{ + struct intel_breadcrumbs *b; + + /* Serialises with i915_request_retire() using rq->lock */ + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)) + return true; + + /* + * Peek at i915_request_submit()/i915_request_unsubmit() status. + * + * If the request is not yet active (and not signaled), we will + * attach the breadcrumb later. + */ + if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) + return true; + + /* + * rq->engine is locked by rq->engine->active.lock. That however + * is not known until after rq->engine has been dereferenced and + * the lock acquired. Hence we acquire the lock and then validate + * that rq->engine still matches the lock we hold for it. + * + * Here, we are using the breadcrumb lock as a proxy for the + * rq->engine->active.lock, and we know that since the breadcrumb + * will be serialised within i915_request_submit/i915_request_unsubmit, + * the engine cannot change while active as long as we hold the + * breadcrumb lock on that engine. + * + * From the dma_fence_enable_signaling() path, we are outside of the + * request submit/unsubmit path, and so we must be more careful to + * acquire the right lock. + */ + b = READ_ONCE(rq->engine)->breadcrumbs; + spin_lock(&b->irq_lock); + while (unlikely(b != READ_ONCE(rq->engine)->breadcrumbs)) { spin_unlock(&b->irq_lock); + b = READ_ONCE(rq->engine)->breadcrumbs; + spin_lock(&b->irq_lock); } - return !__request_completed(rq); + /* + * Now that we are finally serialised with request submit/unsubmit, + * [with b->irq_lock] and with i915_request_retire() [via checking + * SIGNALED with rq->lock] confirm the request is indeed active. If + * it is no longer active, the breadcrumb will be attached upon + * i915_request_submit(). + */ + if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) + insert_breadcrumb(rq, b); + + spin_unlock(&b->irq_lock); + + return true; } void i915_request_cancel_breadcrumb(struct i915_request *rq) { - struct intel_breadcrumbs *b = &rq->engine->breadcrumbs; - - lockdep_assert_held(&rq->lock); + struct intel_breadcrumbs *b = rq->engine->breadcrumbs; /* * We must wait for b->irq_lock so that we know the interrupt handler @@ -382,23 +439,19 @@ void i915_request_cancel_breadcrumb(struct i915_request *rq) list_del(&rq->signal_link); if (list_empty(&ce->signals)) - list_del_init(&ce->signal_link); + remove_signaling_context(b, ce); clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); + i915_request_put(rq); } spin_unlock(&b->irq_lock); } -void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, - struct drm_printer *p) +static void print_signals(struct intel_breadcrumbs *b, struct drm_printer *p) { - struct intel_breadcrumbs *b = &engine->breadcrumbs; struct intel_context *ce; struct i915_request *rq; - if (list_empty(&b->signalers)) - return; - drm_printf(p, "Signals:\n"); spin_lock_irq(&b->irq_lock); @@ -414,3 +467,17 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, } spin_unlock_irq(&b->irq_lock); } + +void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, + struct drm_printer *p) +{ + struct intel_breadcrumbs *b; + + b = engine->breadcrumbs; + if (!b) + return; + + drm_printf(p, "IRQ: %s\n", enableddisabled(b->irq_armed)); + if (!list_empty(&b->signalers)) + print_signals(b, p); +} diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h new file mode 100644 index 000000000000..ed3d1deabfbd --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_BREADCRUMBS__ +#define __INTEL_BREADCRUMBS__ + +#include <linux/irq_work.h> + +#include "intel_engine_types.h" + +struct drm_printer; +struct i915_request; +struct intel_breadcrumbs; + +struct intel_breadcrumbs * +intel_breadcrumbs_create(struct intel_engine_cs *irq_engine); +void intel_breadcrumbs_free(struct intel_breadcrumbs *b); + +void intel_breadcrumbs_reset(struct intel_breadcrumbs *b); +void intel_breadcrumbs_park(struct intel_breadcrumbs *b); + +static inline void +intel_engine_signal_breadcrumbs(struct intel_engine_cs *engine) +{ + irq_work_queue(&engine->breadcrumbs->irq_work); +} + +void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, + struct drm_printer *p); + +bool i915_request_enable_breadcrumb(struct i915_request *request); +void i915_request_cancel_breadcrumb(struct i915_request *request); + +#endif /* __INTEL_BREADCRUMBS__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h new file mode 100644 index 000000000000..8e53b9942695 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_BREADCRUMBS_TYPES__ +#define __INTEL_BREADCRUMBS_TYPES__ + +#include <linux/irq_work.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/types.h> + +/* + * Rather than have every client wait upon all user interrupts, + * with the herd waking after every interrupt and each doing the + * heavyweight seqno dance, we delegate the task (of being the + * bottom-half of the user interrupt) to the first client. After + * every interrupt, we wake up one client, who does the heavyweight + * coherent seqno read and either goes back to sleep (if incomplete), + * or wakes up all the completed clients in parallel, before then + * transferring the bottom-half status to the next client in the queue. + * + * Compared to walking the entire list of waiters in a single dedicated + * bottom-half, we reduce the latency of the first waiter by avoiding + * a context switch, but incur additional coherent seqno reads when + * following the chain of request breadcrumbs. Since it is most likely + * that we have a single client waiting on each seqno, then reducing + * the overhead of waking that client is much preferred. + */ +struct intel_breadcrumbs { + spinlock_t irq_lock; /* protects the lists used in hardirq context */ + + /* Not all breadcrumbs are attached to physical HW */ + struct intel_engine_cs *irq_engine; + + struct list_head signalers; + struct list_head signaled_requests; + + struct irq_work irq_work; /* for use from inside irq_lock */ + + unsigned int irq_enabled; + + bool irq_armed; +}; + +#endif /* __INTEL_BREADCRUMBS_TYPES__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 52db2bde44a3..92a3f25c4006 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -93,57 +93,210 @@ static void intel_context_active_release(struct intel_context *ce) i915_active_release(&ce->active); } -int __intel_context_do_pin(struct intel_context *ce) +static int __context_pin_state(struct i915_vma *vma, struct i915_gem_ww_ctx *ww) +{ + unsigned int bias = i915_ggtt_pin_bias(vma) | PIN_OFFSET_BIAS; + int err; + + err = i915_ggtt_pin(vma, ww, 0, bias | PIN_HIGH); + if (err) + return err; + + err = i915_active_acquire(&vma->active); + if (err) + goto err_unpin; + + /* + * And mark it as a globally pinned object to let the shrinker know + * it cannot reclaim the object until we release it. + */ + i915_vma_make_unshrinkable(vma); + vma->obj->mm.dirty = true; + + return 0; + +err_unpin: + i915_vma_unpin(vma); + return err; +} + +static void __context_unpin_state(struct i915_vma *vma) +{ + i915_vma_make_shrinkable(vma); + i915_active_release(&vma->active); + __i915_vma_unpin(vma); +} + +static int __ring_active(struct intel_ring *ring, + struct i915_gem_ww_ctx *ww) +{ + int err; + + err = intel_ring_pin(ring, ww); + if (err) + return err; + + err = i915_active_acquire(&ring->vma->active); + if (err) + goto err_pin; + + return 0; + +err_pin: + intel_ring_unpin(ring); + return err; +} + +static void __ring_retire(struct intel_ring *ring) +{ + i915_active_release(&ring->vma->active); + intel_ring_unpin(ring); +} + +static int intel_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww) { int err; + CE_TRACE(ce, "active\n"); + + err = __ring_active(ce->ring, ww); + if (err) + return err; + + err = intel_timeline_pin(ce->timeline, ww); + if (err) + goto err_ring; + + if (!ce->state) + return 0; + + err = __context_pin_state(ce->state, ww); + if (err) + goto err_timeline; + + + return 0; + +err_timeline: + intel_timeline_unpin(ce->timeline); +err_ring: + __ring_retire(ce->ring); + return err; +} + +static void intel_context_post_unpin(struct intel_context *ce) +{ + if (ce->state) + __context_unpin_state(ce->state); + + intel_timeline_unpin(ce->timeline); + __ring_retire(ce->ring); +} + +int __intel_context_do_pin_ww(struct intel_context *ce, + struct i915_gem_ww_ctx *ww) +{ + bool handoff = false; + void *vaddr; + int err = 0; + if (unlikely(!test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) { err = intel_context_alloc_state(ce); if (err) return err; } - err = i915_active_acquire(&ce->active); + /* + * We always pin the context/ring/timeline here, to ensure a pin + * refcount for __intel_context_active(), which prevent a lock + * inversion of ce->pin_mutex vs dma_resv_lock(). + */ + + err = i915_gem_object_lock(ce->timeline->hwsp_ggtt->obj, ww); + if (!err && ce->ring->vma->obj) + err = i915_gem_object_lock(ce->ring->vma->obj, ww); + if (!err && ce->state) + err = i915_gem_object_lock(ce->state->obj, ww); + if (!err) + err = intel_context_pre_pin(ce, ww); if (err) return err; - if (mutex_lock_interruptible(&ce->pin_mutex)) { - err = -EINTR; - goto out_release; - } + err = i915_active_acquire(&ce->active); + if (err) + goto err_ctx_unpin; + + err = ce->ops->pre_pin(ce, ww, &vaddr); + if (err) + goto err_release; + + err = mutex_lock_interruptible(&ce->pin_mutex); + if (err) + goto err_post_unpin; if (unlikely(intel_context_is_closed(ce))) { err = -ENOENT; - goto out_unlock; + goto err_unlock; } if (likely(!atomic_add_unless(&ce->pin_count, 1, 0))) { err = intel_context_active_acquire(ce); if (unlikely(err)) - goto out_unlock; + goto err_unlock; - err = ce->ops->pin(ce); - if (unlikely(err)) - goto err_active; + err = ce->ops->pin(ce, vaddr); + if (err) { + intel_context_active_release(ce); + goto err_unlock; + } CE_TRACE(ce, "pin ring:{start:%08x, head:%04x, tail:%04x}\n", i915_ggtt_offset(ce->ring->vma), ce->ring->head, ce->ring->tail); + handoff = true; smp_mb__before_atomic(); /* flush pin before it is visible */ atomic_inc(&ce->pin_count); } GEM_BUG_ON(!intel_context_is_pinned(ce)); /* no overflow! */ - GEM_BUG_ON(i915_active_is_idle(&ce->active)); - goto out_unlock; -err_active: - intel_context_active_release(ce); -out_unlock: +err_unlock: mutex_unlock(&ce->pin_mutex); -out_release: +err_post_unpin: + if (!handoff) + ce->ops->post_unpin(ce); +err_release: i915_active_release(&ce->active); +err_ctx_unpin: + intel_context_post_unpin(ce); + + /* + * Unlock the hwsp_ggtt object since it's shared. + * In principle we can unlock all the global state locked above + * since it's pinned and doesn't need fencing, and will + * thus remain resident until it is explicitly unpinned. + */ + i915_gem_ww_unlock_single(ce->timeline->hwsp_ggtt->obj); + + return err; +} + +int __intel_context_do_pin(struct intel_context *ce) +{ + struct i915_gem_ww_ctx ww; + int err; + + i915_gem_ww_ctx_init(&ww, true); +retry: + err = __intel_context_do_pin_ww(ce, &ww); + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); return err; } @@ -154,6 +307,7 @@ void intel_context_unpin(struct intel_context *ce) CE_TRACE(ce, "unpin\n"); ce->ops->unpin(ce); + ce->ops->post_unpin(ce); /* * Once released, we may asynchronously drop the active reference. @@ -166,65 +320,6 @@ void intel_context_unpin(struct intel_context *ce) intel_context_put(ce); } -static int __context_pin_state(struct i915_vma *vma) -{ - unsigned int bias = i915_ggtt_pin_bias(vma) | PIN_OFFSET_BIAS; - int err; - - err = i915_ggtt_pin(vma, 0, bias | PIN_HIGH); - if (err) - return err; - - err = i915_active_acquire(&vma->active); - if (err) - goto err_unpin; - - /* - * And mark it as a globally pinned object to let the shrinker know - * it cannot reclaim the object until we release it. - */ - i915_vma_make_unshrinkable(vma); - vma->obj->mm.dirty = true; - - return 0; - -err_unpin: - i915_vma_unpin(vma); - return err; -} - -static void __context_unpin_state(struct i915_vma *vma) -{ - i915_vma_make_shrinkable(vma); - i915_active_release(&vma->active); - __i915_vma_unpin(vma); -} - -static int __ring_active(struct intel_ring *ring) -{ - int err; - - err = intel_ring_pin(ring); - if (err) - return err; - - err = i915_active_acquire(&ring->vma->active); - if (err) - goto err_pin; - - return 0; - -err_pin: - intel_ring_unpin(ring); - return err; -} - -static void __ring_retire(struct intel_ring *ring) -{ - i915_active_release(&ring->vma->active); - intel_ring_unpin(ring); -} - __i915_active_call static void __intel_context_retire(struct i915_active *active) { @@ -235,48 +330,29 @@ static void __intel_context_retire(struct i915_active *active) intel_context_get_avg_runtime_ns(ce)); set_bit(CONTEXT_VALID_BIT, &ce->flags); - if (ce->state) - __context_unpin_state(ce->state); - - intel_timeline_unpin(ce->timeline); - __ring_retire(ce->ring); - + intel_context_post_unpin(ce); intel_context_put(ce); } static int __intel_context_active(struct i915_active *active) { struct intel_context *ce = container_of(active, typeof(*ce), active); - int err; - - CE_TRACE(ce, "active\n"); intel_context_get(ce); - err = __ring_active(ce->ring); - if (err) - goto err_put; + /* everything should already be activated by intel_context_pre_pin() */ + GEM_WARN_ON(!i915_active_acquire_if_busy(&ce->ring->vma->active)); + __intel_ring_pin(ce->ring); - err = intel_timeline_pin(ce->timeline); - if (err) - goto err_ring; + __intel_timeline_pin(ce->timeline); - if (!ce->state) - return 0; - - err = __context_pin_state(ce->state); - if (err) - goto err_timeline; + if (ce->state) { + GEM_WARN_ON(!i915_active_acquire_if_busy(&ce->state->active)); + __i915_vma_pin(ce->state); + i915_vma_make_unshrinkable(ce->state); + } return 0; - -err_timeline: - intel_timeline_unpin(ce->timeline); -err_ring: - __ring_retire(ce->ring); -err_put: - intel_context_put(ce); - return err; } void @@ -382,15 +458,38 @@ int intel_context_prepare_remote_request(struct intel_context *ce, struct i915_request *intel_context_create_request(struct intel_context *ce) { + struct i915_gem_ww_ctx ww; struct i915_request *rq; int err; - err = intel_context_pin(ce); - if (unlikely(err)) - return ERR_PTR(err); + i915_gem_ww_ctx_init(&ww, true); +retry: + err = intel_context_pin_ww(ce, &ww); + if (!err) { + rq = i915_request_create(ce); + intel_context_unpin(ce); + } else if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + rq = ERR_PTR(err); + } else { + rq = ERR_PTR(err); + } + + i915_gem_ww_ctx_fini(&ww); - rq = i915_request_create(ce); - intel_context_unpin(ce); + if (IS_ERR(rq)) + return rq; + + /* + * timeline->mutex should be the inner lock, but is used as outer lock. + * Hack around this to shut up lockdep in selftests.. + */ + lockdep_unpin_lock(&ce->timeline->mutex, rq->cookie); + mutex_release(&ce->timeline->mutex.dep_map, _RET_IP_); + mutex_acquire(&ce->timeline->mutex.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_); + rq->cookie = lockdep_pin_lock(&ce->timeline->mutex); return rq; } diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index 07be021882cc..fda2eba81e22 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -25,6 +25,8 @@ ##__VA_ARGS__); \ } while (0) +struct i915_gem_ww_ctx; + void intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine); void intel_context_fini(struct intel_context *ce); @@ -81,6 +83,8 @@ static inline void intel_context_unlock_pinned(struct intel_context *ce) } int __intel_context_do_pin(struct intel_context *ce); +int __intel_context_do_pin_ww(struct intel_context *ce, + struct i915_gem_ww_ctx *ww); static inline bool intel_context_pin_if_active(struct intel_context *ce) { @@ -95,6 +99,15 @@ static inline int intel_context_pin(struct intel_context *ce) return __intel_context_do_pin(ce); } +static inline int intel_context_pin_ww(struct intel_context *ce, + struct i915_gem_ww_ctx *ww) +{ + if (likely(intel_context_pin_if_active(ce))) + return 0; + + return __intel_context_do_pin_ww(ce, ww); +} + static inline void __intel_context_pin(struct intel_context *ce) { GEM_BUG_ON(!intel_context_is_pinned(ce)); diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 4954b0df4864..552cb57a2e8c 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -23,6 +23,7 @@ DECLARE_EWMA(runtime, 3, 8); struct i915_gem_context; +struct i915_gem_ww_ctx; struct i915_vma; struct intel_context; struct intel_ring; @@ -30,8 +31,10 @@ struct intel_ring; struct intel_context_ops { int (*alloc)(struct intel_context *ce); - int (*pin)(struct intel_context *ce); + int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, void **vaddr); + int (*pin)(struct intel_context *ce, void *vaddr); void (*unpin)(struct intel_context *ce); + void (*post_unpin)(struct intel_context *ce); void (*enter)(struct intel_context *ce); void (*exit)(struct intel_context *ce); diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index a9249a23903a..7c3a1012e702 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -223,26 +223,6 @@ void intel_engine_get_instdone(const struct intel_engine_cs *engine, void intel_engine_init_execlists(struct intel_engine_cs *engine); -void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine); -void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine); - -void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine); - -static inline void -intel_engine_signal_breadcrumbs(struct intel_engine_cs *engine) -{ - irq_work_queue(&engine->breadcrumbs.irq_work); -} - -void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine); -void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine); - -void intel_engine_transfer_stale_breadcrumbs(struct intel_engine_cs *engine, - struct intel_context *ce); - -void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, - struct drm_printer *p); - static inline u32 *__gen8_emit_pipe_control(u32 *batch, u32 flags0, u32 flags1, u32 offset) { memset(batch, 0, 6 * sizeof(u32)); @@ -357,4 +337,13 @@ intel_engine_has_preempt_reset(const struct intel_engine_cs *engine) return intel_engine_has_preemption(engine); } +static inline bool +intel_engine_has_heartbeat(const struct intel_engine_cs *engine) +{ + if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL)) + return false; + + return READ_ONCE(engine->props.heartbeat_interval_ms); +} + #endif /* _INTEL_RINGBUFFER_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 26087dd79782..5bfb5f7ed02c 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -28,6 +28,7 @@ #include "i915_drv.h" +#include "intel_breadcrumbs.h" #include "intel_context.h" #include "intel_engine.h" #include "intel_engine_pm.h" @@ -634,7 +635,7 @@ static int pin_ggtt_status_page(struct intel_engine_cs *engine, else flags = PIN_HIGH; - return i915_ggtt_pin(vma, 0, flags); + return i915_ggtt_pin(vma, NULL, 0, flags); } static int init_status_page(struct intel_engine_cs *engine) @@ -700,8 +701,13 @@ static int engine_setup_common(struct intel_engine_cs *engine) if (err) return err; + engine->breadcrumbs = intel_breadcrumbs_create(engine); + if (!engine->breadcrumbs) { + err = -ENOMEM; + goto err_status; + } + intel_engine_init_active(engine, ENGINE_PHYSICAL); - intel_engine_init_breadcrumbs(engine); intel_engine_init_execlists(engine); intel_engine_init_cmd_parser(engine); intel_engine_init__pm(engine); @@ -716,6 +722,10 @@ static int engine_setup_common(struct intel_engine_cs *engine) intel_engine_init_ctx_wa(engine); return 0; + +err_status: + cleanup_status_page(engine); + return err; } struct measure_breadcrumb { @@ -785,9 +795,11 @@ intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass) } static struct intel_context * -create_kernel_context(struct intel_engine_cs *engine) +create_pinned_context(struct intel_engine_cs *engine, + unsigned int hwsp, + struct lock_class_key *key, + const char *name) { - static struct lock_class_key kernel; struct intel_context *ce; int err; @@ -796,6 +808,7 @@ create_kernel_context(struct intel_engine_cs *engine) return ce; __set_bit(CONTEXT_BARRIER_BIT, &ce->flags); + ce->timeline = page_pack_bits(NULL, hwsp); err = intel_context_pin(ce); /* perma-pin so it is always available */ if (err) { @@ -809,11 +822,20 @@ create_kernel_context(struct intel_engine_cs *engine) * should we need to inject GPU operations during their request * construction. */ - lockdep_set_class(&ce->timeline->mutex, &kernel); + lockdep_set_class_and_name(&ce->timeline->mutex, key, name); return ce; } +static struct intel_context * +create_kernel_context(struct intel_engine_cs *engine) +{ + static struct lock_class_key kernel; + + return create_pinned_context(engine, I915_GEM_HWS_SEQNO_ADDR, + &kernel, "kernel_context"); +} + /** * intel_engines_init_common - initialize cengine state which might require hw access * @engine: Engine to initialize. @@ -902,9 +924,9 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine) tasklet_kill(&engine->execlists.tasklet); /* flush the callback */ cleanup_status_page(engine); + intel_breadcrumbs_free(engine->breadcrumbs); intel_engine_fini_retire(engine); - intel_engine_fini_breadcrumbs(engine); intel_engine_cleanup_cmd_parser(engine); if (engine->default_state) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c index 8ffdf676c0a0..5067d0524d4b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -177,36 +177,82 @@ void intel_engine_init_heartbeat(struct intel_engine_cs *engine) INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat); } +static int __intel_engine_pulse(struct intel_engine_cs *engine) +{ + struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; + struct intel_context *ce = engine->kernel_context; + struct i915_request *rq; + + lockdep_assert_held(&ce->timeline->mutex); + GEM_BUG_ON(!intel_engine_has_preemption(engine)); + GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); + + intel_context_enter(ce); + rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); + intel_context_exit(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); + idle_pulse(engine, rq); + + __i915_request_commit(rq); + __i915_request_queue(rq, &attr); + GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); + + return 0; +} + +static unsigned long set_heartbeat(struct intel_engine_cs *engine, + unsigned long delay) +{ + unsigned long old; + + old = xchg(&engine->props.heartbeat_interval_ms, delay); + if (delay) + intel_engine_unpark_heartbeat(engine); + else + intel_engine_park_heartbeat(engine); + + return old; +} + int intel_engine_set_heartbeat(struct intel_engine_cs *engine, unsigned long delay) { - int err; + struct intel_context *ce = engine->kernel_context; + int err = 0; - /* Send one last pulse before to cleanup persistent hogs */ - if (!delay && IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT)) { - err = intel_engine_pulse(engine); - if (err) - return err; - } + if (!delay && !intel_engine_has_preempt_reset(engine)) + return -ENODEV; + + intel_engine_pm_get(engine); + + err = mutex_lock_interruptible(&ce->timeline->mutex); + if (err) + goto out_rpm; - WRITE_ONCE(engine->props.heartbeat_interval_ms, delay); + if (delay != engine->props.heartbeat_interval_ms) { + unsigned long saved = set_heartbeat(engine, delay); - if (intel_engine_pm_get_if_awake(engine)) { - if (delay) - intel_engine_unpark_heartbeat(engine); - else - intel_engine_park_heartbeat(engine); - intel_engine_pm_put(engine); + /* recheck current execution */ + if (intel_engine_has_preemption(engine)) { + err = __intel_engine_pulse(engine); + if (err) + set_heartbeat(engine, saved); + } } - return 0; + mutex_unlock(&ce->timeline->mutex); + +out_rpm: + intel_engine_pm_put(engine); + return err; } int intel_engine_pulse(struct intel_engine_cs *engine) { - struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; struct intel_context *ce = engine->kernel_context; - struct i915_request *rq; int err; if (!intel_engine_has_preemption(engine)) @@ -215,30 +261,12 @@ int intel_engine_pulse(struct intel_engine_cs *engine) if (!intel_engine_pm_get_if_awake(engine)) return 0; - if (mutex_lock_interruptible(&ce->timeline->mutex)) { - err = -EINTR; - goto out_rpm; - } - - intel_context_enter(ce); - rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); - intel_context_exit(ce); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - goto out_unlock; + err = -EINTR; + if (!mutex_lock_interruptible(&ce->timeline->mutex)) { + err = __intel_engine_pulse(engine); + mutex_unlock(&ce->timeline->mutex); } - __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); - idle_pulse(engine, rq); - - __i915_request_commit(rq); - __i915_request_queue(rq, &attr); - GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); - err = 0; - -out_unlock: - mutex_unlock(&ce->timeline->mutex); -out_rpm: intel_engine_pm_put(engine); return err; } diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index 8ec3eecf3e39..f7b2e07e2229 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -6,6 +6,7 @@ #include "i915_drv.h" +#include "intel_breadcrumbs.h" #include "intel_context.h" #include "intel_engine.h" #include "intel_engine_heartbeat.h" @@ -247,7 +248,7 @@ static int __engine_park(struct intel_wakeref *wf) call_idle_barriers(engine); /* cleanup after wedging */ intel_engine_park_heartbeat(engine); - intel_engine_disarm_breadcrumbs(engine); + intel_breadcrumbs_park(engine->breadcrumbs); /* Must be reset upon idling, or we may miss the busy wakeup. */ GEM_BUG_ON(engine->execlists.queue_priority_hint != INT_MIN); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 8de92fd7d392..c400aaa2287b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -22,6 +22,7 @@ #include "i915_pmu.h" #include "i915_priolist_types.h" #include "i915_selftest.h" +#include "intel_breadcrumbs_types.h" #include "intel_sseu.h" #include "intel_timeline_types.h" #include "intel_uncore.h" @@ -373,34 +374,8 @@ struct intel_engine_cs { */ struct ewma__engine_latency latency; - /* Rather than have every client wait upon all user interrupts, - * with the herd waking after every interrupt and each doing the - * heavyweight seqno dance, we delegate the task (of being the - * bottom-half of the user interrupt) to the first client. After - * every interrupt, we wake up one client, who does the heavyweight - * coherent seqno read and either goes back to sleep (if incomplete), - * or wakes up all the completed clients in parallel, before then - * transferring the bottom-half status to the next client in the queue. - * - * Compared to walking the entire list of waiters in a single dedicated - * bottom-half, we reduce the latency of the first waiter by avoiding - * a context switch, but incur additional coherent seqno reads when - * following the chain of request breadcrumbs. Since it is most likely - * that we have a single client waiting on each seqno, then reducing - * the overhead of waking that client is much preferred. - */ - struct intel_breadcrumbs { - spinlock_t irq_lock; - struct list_head signalers; - - struct list_head signaled_requests; - - struct irq_work irq_work; /* for use from inside irq_lock */ - - unsigned int irq_enabled; - - bool irq_armed; - } breadcrumbs; + /* Keep track of all the seqno used, a trail of breadcrumbs */ + struct intel_breadcrumbs *breadcrumbs; struct intel_engine_pmu { /** diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 99e28d9021e8..81c05f551b9c 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -78,8 +78,6 @@ int i915_ggtt_init_hw(struct drm_i915_private *i915) { int ret; - stash_init(&i915->mm.wc_stash); - /* * Note that we use page colouring to enforce a guard page at the * end of the address space. This is required as the CS may prefetch @@ -232,7 +230,7 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm, /* Fill the allocated but "unused" space beyond the end of the buffer */ while (gte < end) - gen8_set_pte(gte++, vm->scratch[0].encode); + gen8_set_pte(gte++, vm->scratch[0]->encode); /* * We want to flush the TLBs only after we're certain all the PTE @@ -283,7 +281,7 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm, /* Fill the allocated but "unused" space beyond the end of the buffer */ while (gte < end) - iowrite32(vm->scratch[0].encode, gte++); + iowrite32(vm->scratch[0]->encode, gte++); /* * We want to flush the TLBs only after we're certain all the PTE @@ -303,7 +301,7 @@ static void gen8_ggtt_clear_range(struct i915_address_space *vm, struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); unsigned int first_entry = start / I915_GTT_PAGE_SIZE; unsigned int num_entries = length / I915_GTT_PAGE_SIZE; - const gen8_pte_t scratch_pte = vm->scratch[0].encode; + const gen8_pte_t scratch_pte = vm->scratch[0]->encode; gen8_pte_t __iomem *gtt_base = (gen8_pte_t __iomem *)ggtt->gsm + first_entry; const int max_entries = ggtt_total_entries(ggtt) - first_entry; @@ -401,7 +399,7 @@ static void gen6_ggtt_clear_range(struct i915_address_space *vm, first_entry, num_entries, max_entries)) num_entries = max_entries; - scratch_pte = vm->scratch[0].encode; + scratch_pte = vm->scratch[0]->encode; for (i = 0; i < num_entries; i++) iowrite32(scratch_pte, >t_base[i]); } @@ -436,16 +434,17 @@ static void i915_ggtt_clear_range(struct i915_address_space *vm, intel_gtt_clear_range(start >> PAGE_SHIFT, length >> PAGE_SHIFT); } -static int ggtt_bind_vma(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags) +static void ggtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) { struct drm_i915_gem_object *obj = vma->obj; u32 pte_flags; if (i915_vma_is_bound(vma, ~flags & I915_VMA_BIND_MASK)) - return 0; + return; /* Applicable to VLV (gen8+ do not support RO in the GGTT) */ pte_flags = 0; @@ -454,8 +453,6 @@ static int ggtt_bind_vma(struct i915_address_space *vm, vm->insert_entries(vm, vma, cache_level, pte_flags); vma->page_sizes.gtt = I915_GTT_PAGE_SIZE; - - return 0; } static void ggtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma) @@ -568,31 +565,25 @@ err: return ret; } -static int aliasing_gtt_bind_vma(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags) +static void aliasing_gtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) { u32 pte_flags; - int ret; /* Currently applicable only to VLV */ pte_flags = 0; if (i915_gem_object_is_readonly(vma->obj)) pte_flags |= PTE_READ_ONLY; - if (flags & I915_VMA_LOCAL_BIND) { - struct i915_ppgtt *alias = i915_vm_to_ggtt(vm)->alias; - - ret = ppgtt_bind_vma(&alias->vm, vma, cache_level, flags); - if (ret) - return ret; - } + if (flags & I915_VMA_LOCAL_BIND) + ppgtt_bind_vma(&i915_vm_to_ggtt(vm)->alias->vm, + stash, vma, cache_level, flags); if (flags & I915_VMA_GLOBAL_BIND) vm->insert_entries(vm, vma, cache_level, pte_flags); - - return 0; } static void aliasing_gtt_unbind_vma(struct i915_address_space *vm, @@ -607,6 +598,7 @@ static void aliasing_gtt_unbind_vma(struct i915_address_space *vm, static int init_aliasing_ppgtt(struct i915_ggtt *ggtt) { + struct i915_vm_pt_stash stash = {}; struct i915_ppgtt *ppgtt; int err; @@ -619,15 +611,21 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt) goto err_ppgtt; } + err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, ggtt->vm.total); + if (err) + goto err_ppgtt; + + err = i915_vm_pin_pt_stash(&ppgtt->vm, &stash); + if (err) + goto err_stash; + /* * Note we only pre-allocate as far as the end of the global * GTT. On 48b / 4-level page-tables, the difference is very, * very significant! We have to preallocate as GVT/vgpu does * not like the page directory disappearing. */ - err = ppgtt->vm.allocate_va_range(&ppgtt->vm, 0, ggtt->vm.total); - if (err) - goto err_ppgtt; + ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, 0, ggtt->vm.total); ggtt->alias = ppgtt; ggtt->vm.bind_async_flags |= ppgtt->vm.bind_async_flags; @@ -638,8 +636,11 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt) GEM_BUG_ON(ggtt->vm.vma_ops.unbind_vma != ggtt_unbind_vma); ggtt->vm.vma_ops.unbind_vma = aliasing_gtt_unbind_vma; + i915_vm_free_pt_stash(&ppgtt->vm, &stash); return 0; +err_stash: + i915_vm_free_pt_stash(&ppgtt->vm, &stash); err_ppgtt: i915_vm_put(&ppgtt->vm); return err; @@ -715,18 +716,11 @@ static void ggtt_cleanup_hw(struct i915_ggtt *ggtt) void i915_ggtt_driver_release(struct drm_i915_private *i915) { struct i915_ggtt *ggtt = &i915->ggtt; - struct pagevec *pvec; fini_aliasing_ppgtt(ggtt); intel_ggtt_fini_fences(ggtt); ggtt_cleanup_hw(ggtt); - - pvec = &i915->mm.wc_stash.pvec; - if (pvec->nr) { - set_pages_array_wb(pvec->pages, pvec->nr); - __pagevec_release(pvec); - } } static unsigned int gen6_get_total_gtt_size(u16 snb_gmch_ctl) @@ -789,7 +783,7 @@ static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size) return -ENOMEM; } - ret = setup_scratch_page(&ggtt->vm, GFP_DMA32); + ret = setup_scratch_page(&ggtt->vm); if (ret) { drm_err(&i915->drm, "Scratch setup failed\n"); /* iounmap will also get called at remove, but meh */ @@ -797,8 +791,8 @@ static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size) return ret; } - ggtt->vm.scratch[0].encode = - ggtt->vm.pte_encode(px_dma(&ggtt->vm.scratch[0]), + ggtt->vm.scratch[0]->encode = + ggtt->vm.pte_encode(px_dma(ggtt->vm.scratch[0]), I915_CACHE_NONE, 0); return 0; @@ -824,7 +818,7 @@ static void gen6_gmch_remove(struct i915_address_space *vm) struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); iounmap(ggtt->gsm); - cleanup_scratch_page(vm); + free_scratch(vm); } static struct resource pci_resource(struct pci_dev *pdev, int bar) @@ -852,6 +846,8 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt) else size = gen8_get_total_gtt_size(snb_gmch_ctl); + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + ggtt->vm.total = (size / sizeof(gen8_pte_t)) * I915_GTT_PAGE_SIZE; ggtt->vm.cleanup = gen6_gmch_remove; ggtt->vm.insert_page = gen8_ggtt_insert_page; @@ -1000,6 +996,8 @@ static int gen6_gmch_probe(struct i915_ggtt *ggtt) size = gen6_get_total_gtt_size(snb_gmch_ctl); ggtt->vm.total = (size / sizeof(gen6_pte_t)) * I915_GTT_PAGE_SIZE; + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + ggtt->vm.clear_range = nop_clear_range; if (!HAS_FULL_PPGTT(i915) || intel_scanout_needs_vtd_wa(i915)) ggtt->vm.clear_range = gen6_ggtt_clear_range; @@ -1050,6 +1048,8 @@ static int i915_gmch_probe(struct i915_ggtt *ggtt) ggtt->gmadr = (struct resource)DEFINE_RES_MEM(gmadr_base, ggtt->mappable_end); + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + ggtt->do_idle_maps = needs_idle_maps(i915); ggtt->vm.insert_page = i915_ggtt_insert_page; ggtt->vm.insert_entries = i915_ggtt_insert_entries; @@ -1165,11 +1165,6 @@ void i915_ggtt_disable_guc(struct i915_ggtt *ggtt) ggtt->invalidate(ggtt); } -static unsigned int clear_bind(struct i915_vma *vma) -{ - return atomic_fetch_and(~I915_VMA_BIND_MASK, &vma->flags); -} - void i915_ggtt_resume(struct i915_ggtt *ggtt) { struct i915_vma *vma; @@ -1187,11 +1182,13 @@ void i915_ggtt_resume(struct i915_ggtt *ggtt) /* clflush objects bound into the GGTT and rebind them. */ list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link) { struct drm_i915_gem_object *obj = vma->obj; - unsigned int was_bound = clear_bind(vma); + unsigned int was_bound = + atomic_read(&vma->flags) & I915_VMA_BIND_MASK; - WARN_ON(i915_vma_bind(vma, - obj ? obj->cache_level : 0, - was_bound, NULL)); + GEM_BUG_ON(!was_bound); + vma->ops->bind_vma(&ggtt->vm, NULL, vma, + obj ? obj->cache_level : 0, + was_bound); if (obj) { /* only used during resume => exclusive access */ flush |= fetch_and_zero(&obj->write_domain); obj->read_domains |= I915_GEM_DOMAIN_GTT; diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index e0755f1a904b..39b428c5049c 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -356,7 +356,7 @@ static int intel_gt_init_scratch(struct intel_gt *gt, unsigned int size) goto err_unref; } - ret = i915_ggtt_pin(vma, 0, PIN_HIGH); + ret = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); if (ret) goto err_unref; @@ -406,21 +406,20 @@ static int __engines_record_defaults(struct intel_gt *gt) /* We must be able to switch to something! */ GEM_BUG_ON(!engine->kernel_context); - err = intel_renderstate_init(&so, engine); - if (err) - goto out; - ce = intel_context_create(engine); if (IS_ERR(ce)) { err = PTR_ERR(ce); goto out; } - rq = intel_context_create_request(ce); + err = intel_renderstate_init(&so, ce); + if (err) + goto err; + + rq = i915_request_create(ce); if (IS_ERR(rq)) { err = PTR_ERR(rq); - intel_context_put(ce); - goto out; + goto err_fini; } err = intel_engine_emit_ctx_wa(rq); @@ -434,9 +433,13 @@ static int __engines_record_defaults(struct intel_gt *gt) err_rq: requests[id] = i915_request_get(rq); i915_request_add(rq); - intel_renderstate_fini(&so); - if (err) +err_fini: + intel_renderstate_fini(&so, ce); +err: + if (err) { + intel_context_put(ce); goto out; + } } /* Flush the default context image to memory, and enable powersaving. */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c index 418ae184cecf..104cb30e8c13 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c @@ -35,39 +35,65 @@ static void node_free(struct intel_gt_buffer_pool_node *node) { i915_gem_object_put(node->obj); i915_active_fini(&node->active); - kfree(node); + kfree_rcu(node, rcu); } -static void pool_free_work(struct work_struct *wrk) +static bool pool_free_older_than(struct intel_gt_buffer_pool *pool, long keep) { - struct intel_gt_buffer_pool *pool = - container_of(wrk, typeof(*pool), work.work); - struct intel_gt_buffer_pool_node *node, *next; - unsigned long old = jiffies - HZ; + struct intel_gt_buffer_pool_node *node, *stale = NULL; bool active = false; - LIST_HEAD(stale); int n; /* Free buffers that have not been used in the past second */ - spin_lock_irq(&pool->lock); for (n = 0; n < ARRAY_SIZE(pool->cache_list); n++) { struct list_head *list = &pool->cache_list[n]; - /* Most recent at head; oldest at tail */ - list_for_each_entry_safe_reverse(node, next, list, link) { - if (time_before(node->age, old)) - break; + if (list_empty(list)) + continue; + + if (spin_trylock_irq(&pool->lock)) { + struct list_head *pos; + + /* Most recent at head; oldest at tail */ + list_for_each_prev(pos, list) { + unsigned long age; + + node = list_entry(pos, typeof(*node), link); + + age = READ_ONCE(node->age); + if (!age || jiffies - age < keep) + break; + + /* Check we are the first to claim this node */ + if (!xchg(&node->age, 0)) + break; - list_move(&node->link, &stale); + node->free = stale; + stale = node; + } + if (!list_is_last(pos, list)) + __list_del_many(pos, list); + + spin_unlock_irq(&pool->lock); } + active |= !list_empty(list); } - spin_unlock_irq(&pool->lock); - list_for_each_entry_safe(node, next, &stale, link) + while ((node = stale)) { + stale = stale->free; node_free(node); + } + + return active; +} + +static void pool_free_work(struct work_struct *wrk) +{ + struct intel_gt_buffer_pool *pool = + container_of(wrk, typeof(*pool), work.work); - if (active) + if (pool_free_older_than(pool, HZ)) schedule_delayed_work(&pool->work, round_jiffies_up_relative(HZ)); } @@ -108,9 +134,10 @@ static void pool_retire(struct i915_active *ref) /* Return this object to the shrinker pool */ i915_gem_object_make_purgeable(node->obj); + GEM_BUG_ON(node->age); spin_lock_irqsave(&pool->lock, flags); - node->age = jiffies; - list_add(&node->link, list); + list_add_rcu(&node->link, list); + WRITE_ONCE(node->age, jiffies ?: 1); /* 0 reserved for active nodes */ spin_unlock_irqrestore(&pool->lock, flags); schedule_delayed_work(&pool->work, @@ -129,6 +156,7 @@ node_create(struct intel_gt_buffer_pool *pool, size_t sz) if (!node) return ERR_PTR(-ENOMEM); + node->age = 0; node->pool = pool; i915_active_init(&node->active, pool_active, pool_retire); @@ -151,20 +179,30 @@ intel_gt_get_buffer_pool(struct intel_gt *gt, size_t size) struct intel_gt_buffer_pool *pool = >->buffer_pool; struct intel_gt_buffer_pool_node *node; struct list_head *list; - unsigned long flags; int ret; size = PAGE_ALIGN(size); list = bucket_for_size(pool, size); - spin_lock_irqsave(&pool->lock, flags); - list_for_each_entry(node, list, link) { + rcu_read_lock(); + list_for_each_entry_rcu(node, list, link) { + unsigned long age; + if (node->obj->base.size < size) continue; - list_del(&node->link); - break; + + age = READ_ONCE(node->age); + if (!age) + continue; + + if (cmpxchg(&node->age, age, 0) == age) { + spin_lock_irq(&pool->lock); + list_del_rcu(&node->link); + spin_unlock_irq(&pool->lock); + break; + } } - spin_unlock_irqrestore(&pool->lock, flags); + rcu_read_unlock(); if (&node->link == list) { node = node_create(pool, size); @@ -192,28 +230,13 @@ void intel_gt_init_buffer_pool(struct intel_gt *gt) INIT_DELAYED_WORK(&pool->work, pool_free_work); } -static void pool_free_imm(struct intel_gt_buffer_pool *pool) -{ - int n; - - spin_lock_irq(&pool->lock); - for (n = 0; n < ARRAY_SIZE(pool->cache_list); n++) { - struct intel_gt_buffer_pool_node *node, *next; - struct list_head *list = &pool->cache_list[n]; - - list_for_each_entry_safe(node, next, list, link) - node_free(node); - INIT_LIST_HEAD(list); - } - spin_unlock_irq(&pool->lock); -} - void intel_gt_flush_buffer_pool(struct intel_gt *gt) { struct intel_gt_buffer_pool *pool = >->buffer_pool; do { - pool_free_imm(pool); + while (pool_free_older_than(pool, 0)) + ; } while (cancel_delayed_work_sync(&pool->work)); } diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h index e28bdda771ed..bcf1658c9633 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h @@ -25,7 +25,11 @@ struct intel_gt_buffer_pool_node { struct i915_active active; struct drm_i915_gem_object *obj; struct list_head link; - struct intel_gt_buffer_pool *pool; + union { + struct intel_gt_buffer_pool *pool; + struct intel_gt_buffer_pool_node *free; + struct rcu_head rcu; + }; unsigned long age; }; diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c index b05da68e52f4..257063a57101 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c @@ -8,6 +8,7 @@ #include "i915_drv.h" #include "i915_irq.h" +#include "intel_breadcrumbs.h" #include "intel_gt.h" #include "intel_gt_irq.h" #include "intel_uncore.h" diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 2a72cce63fd9..3f1114b58b01 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -11,160 +11,24 @@ #include "intel_gt.h" #include "intel_gtt.h" -void stash_init(struct pagestash *stash) +struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz) { - pagevec_init(&stash->pvec); - spin_lock_init(&stash->lock); -} - -static struct page *stash_pop_page(struct pagestash *stash) -{ - struct page *page = NULL; - - spin_lock(&stash->lock); - if (likely(stash->pvec.nr)) - page = stash->pvec.pages[--stash->pvec.nr]; - spin_unlock(&stash->lock); - - return page; -} - -static void stash_push_pagevec(struct pagestash *stash, struct pagevec *pvec) -{ - unsigned int nr; - - spin_lock_nested(&stash->lock, SINGLE_DEPTH_NESTING); - - nr = min_t(typeof(nr), pvec->nr, pagevec_space(&stash->pvec)); - memcpy(stash->pvec.pages + stash->pvec.nr, - pvec->pages + pvec->nr - nr, - sizeof(pvec->pages[0]) * nr); - stash->pvec.nr += nr; - - spin_unlock(&stash->lock); - - pvec->nr -= nr; -} - -static struct page *vm_alloc_page(struct i915_address_space *vm, gfp_t gfp) -{ - struct pagevec stack; - struct page *page; - if (I915_SELFTEST_ONLY(should_fail(&vm->fault_attr, 1))) i915_gem_shrink_all(vm->i915); - page = stash_pop_page(&vm->free_pages); - if (page) - return page; - - if (!vm->pt_kmap_wc) - return alloc_page(gfp); - - /* Look in our global stash of WC pages... */ - page = stash_pop_page(&vm->i915->mm.wc_stash); - if (page) - return page; - - /* - * Otherwise batch allocate pages to amortize cost of set_pages_wc. - * - * We have to be careful as page allocation may trigger the shrinker - * (via direct reclaim) which will fill up the WC stash underneath us. - * So we add our WB pages into a temporary pvec on the stack and merge - * them into the WC stash after all the allocations are complete. - */ - pagevec_init(&stack); - do { - struct page *page; - - page = alloc_page(gfp); - if (unlikely(!page)) - break; - - stack.pages[stack.nr++] = page; - } while (pagevec_space(&stack)); - - if (stack.nr && !set_pages_array_wc(stack.pages, stack.nr)) { - page = stack.pages[--stack.nr]; - - /* Merge spare WC pages to the global stash */ - if (stack.nr) - stash_push_pagevec(&vm->i915->mm.wc_stash, &stack); - - /* Push any surplus WC pages onto the local VM stash */ - if (stack.nr) - stash_push_pagevec(&vm->free_pages, &stack); - } - - /* Return unwanted leftovers */ - if (unlikely(stack.nr)) { - WARN_ON_ONCE(set_pages_array_wb(stack.pages, stack.nr)); - __pagevec_release(&stack); - } - - return page; + return i915_gem_object_create_internal(vm->i915, sz); } -static void vm_free_pages_release(struct i915_address_space *vm, - bool immediate) +int pin_pt_dma(struct i915_address_space *vm, struct drm_i915_gem_object *obj) { - struct pagevec *pvec = &vm->free_pages.pvec; - struct pagevec stack; - - lockdep_assert_held(&vm->free_pages.lock); - GEM_BUG_ON(!pagevec_count(pvec)); - - if (vm->pt_kmap_wc) { - /* - * When we use WC, first fill up the global stash and then - * only if full immediately free the overflow. - */ - stash_push_pagevec(&vm->i915->mm.wc_stash, pvec); - - /* - * As we have made some room in the VM's free_pages, - * we can wait for it to fill again. Unless we are - * inside i915_address_space_fini() and must - * immediately release the pages! - */ - if (pvec->nr <= (immediate ? 0 : PAGEVEC_SIZE - 1)) - return; + int err; - /* - * We have to drop the lock to allow ourselves to sleep, - * so take a copy of the pvec and clear the stash for - * others to use it as we sleep. - */ - stack = *pvec; - pagevec_reinit(pvec); - spin_unlock(&vm->free_pages.lock); - - pvec = &stack; - set_pages_array_wb(pvec->pages, pvec->nr); - - spin_lock(&vm->free_pages.lock); - } + err = i915_gem_object_pin_pages(obj); + if (err) + return err; - __pagevec_release(pvec); -} - -static void vm_free_page(struct i915_address_space *vm, struct page *page) -{ - /* - * On !llc, we need to change the pages back to WB. We only do so - * in bulk, so we rarely need to change the page attributes here, - * but doing so requires a stop_machine() from deep inside arch/x86/mm. - * To make detection of the possible sleep more likely, use an - * unconditional might_sleep() for everybody. - */ - might_sleep(); - spin_lock(&vm->free_pages.lock); - while (!pagevec_space(&vm->free_pages.pvec)) - vm_free_pages_release(vm, false); - GEM_BUG_ON(pagevec_count(&vm->free_pages.pvec) >= PAGEVEC_SIZE); - pagevec_add(&vm->free_pages.pvec, page); - spin_unlock(&vm->free_pages.lock); + i915_gem_object_make_unshrinkable(obj); + return 0; } void __i915_vm_close(struct i915_address_space *vm) @@ -194,14 +58,7 @@ void __i915_vm_close(struct i915_address_space *vm) void i915_address_space_fini(struct i915_address_space *vm) { - spin_lock(&vm->free_pages.lock); - if (pagevec_count(&vm->free_pages.pvec)) - vm_free_pages_release(vm, true); - GEM_BUG_ON(pagevec_count(&vm->free_pages.pvec)); - spin_unlock(&vm->free_pages.lock); - drm_mm_takedown(&vm->mm); - mutex_destroy(&vm->mutex); } @@ -246,8 +103,6 @@ void i915_address_space_init(struct i915_address_space *vm, int subclass) drm_mm_init(&vm->mm, 0, vm->total); vm->mm.head_node.color = I915_COLOR_UNEVICTABLE; - stash_init(&vm->free_pages); - INIT_LIST_HEAD(&vm->bound_list); } @@ -264,64 +119,50 @@ void clear_pages(struct i915_vma *vma) memset(&vma->page_sizes, 0, sizeof(vma->page_sizes)); } -static int __setup_page_dma(struct i915_address_space *vm, - struct i915_page_dma *p, - gfp_t gfp) -{ - p->page = vm_alloc_page(vm, gfp | I915_GFP_ALLOW_FAIL); - if (unlikely(!p->page)) - return -ENOMEM; - - p->daddr = dma_map_page_attrs(vm->dma, - p->page, 0, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL, - DMA_ATTR_SKIP_CPU_SYNC | - DMA_ATTR_NO_WARN); - if (unlikely(dma_mapping_error(vm->dma, p->daddr))) { - vm_free_page(vm, p->page); - return -ENOMEM; - } - - return 0; -} - -int setup_page_dma(struct i915_address_space *vm, struct i915_page_dma *p) +dma_addr_t __px_dma(struct drm_i915_gem_object *p) { - return __setup_page_dma(vm, p, __GFP_HIGHMEM); + GEM_BUG_ON(!i915_gem_object_has_pages(p)); + return sg_dma_address(p->mm.pages->sgl); } -void cleanup_page_dma(struct i915_address_space *vm, struct i915_page_dma *p) +struct page *__px_page(struct drm_i915_gem_object *p) { - dma_unmap_page(vm->dma, p->daddr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); - vm_free_page(vm, p->page); + GEM_BUG_ON(!i915_gem_object_has_pages(p)); + return sg_page(p->mm.pages->sgl); } void -fill_page_dma(const struct i915_page_dma *p, const u64 val, unsigned int count) +fill_page_dma(struct drm_i915_gem_object *p, const u64 val, unsigned int count) { - kunmap_atomic(memset64(kmap_atomic(p->page), val, count)); + struct page *page = __px_page(p); + void *vaddr; + + vaddr = kmap(page); + memset64(vaddr, val, count); + clflush_cache_range(vaddr, PAGE_SIZE); + kunmap(page); } -static void poison_scratch_page(struct page *page, unsigned long size) +static void poison_scratch_page(struct drm_i915_gem_object *scratch) { - if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) - return; + struct sgt_iter sgt; + struct page *page; + u8 val; - GEM_BUG_ON(!IS_ALIGNED(size, PAGE_SIZE)); + val = 0; + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + val = POISON_FREE; - do { + for_each_sgt_page(page, sgt, scratch->mm.pages) { void *vaddr; vaddr = kmap(page); - memset(vaddr, POISON_FREE, PAGE_SIZE); + memset(vaddr, val, PAGE_SIZE); kunmap(page); - - page = pfn_to_page(page_to_pfn(page) + 1); - size -= PAGE_SIZE; - } while (size); + } } -int setup_scratch_page(struct i915_address_space *vm, gfp_t gfp) +int setup_scratch_page(struct i915_address_space *vm) { unsigned long size; @@ -338,21 +179,27 @@ int setup_scratch_page(struct i915_address_space *vm, gfp_t gfp) */ size = I915_GTT_PAGE_SIZE_4K; if (i915_vm_is_4lvl(vm) && - HAS_PAGE_SIZES(vm->i915, I915_GTT_PAGE_SIZE_64K)) { + HAS_PAGE_SIZES(vm->i915, I915_GTT_PAGE_SIZE_64K)) size = I915_GTT_PAGE_SIZE_64K; - gfp |= __GFP_NOWARN; - } - gfp |= __GFP_ZERO | __GFP_RETRY_MAYFAIL; do { - unsigned int order = get_order(size); - struct page *page; - dma_addr_t addr; + struct drm_i915_gem_object *obj; - page = alloc_pages(gfp, order); - if (unlikely(!page)) + obj = vm->alloc_pt_dma(vm, size); + if (IS_ERR(obj)) goto skip; + if (pin_pt_dma(vm, obj)) + goto skip_obj; + + /* We need a single contiguous page for our scratch */ + if (obj->mm.page_sizes.sg < size) + goto skip_obj; + + /* And it needs to be correspondingly aligned */ + if (__px_dma(obj) & (size - 1)) + goto skip_obj; + /* * Use a non-zero scratch page for debugging. * @@ -362,61 +209,28 @@ int setup_scratch_page(struct i915_address_space *vm, gfp_t gfp) * should it ever be accidentally used, the effect should be * fairly benign. */ - poison_scratch_page(page, size); - - addr = dma_map_page_attrs(vm->dma, - page, 0, size, - PCI_DMA_BIDIRECTIONAL, - DMA_ATTR_SKIP_CPU_SYNC | - DMA_ATTR_NO_WARN); - if (unlikely(dma_mapping_error(vm->dma, addr))) - goto free_page; - - if (unlikely(!IS_ALIGNED(addr, size))) - goto unmap_page; - - vm->scratch[0].base.page = page; - vm->scratch[0].base.daddr = addr; - vm->scratch_order = order; + poison_scratch_page(obj); + + vm->scratch[0] = obj; + vm->scratch_order = get_order(size); return 0; -unmap_page: - dma_unmap_page(vm->dma, addr, size, PCI_DMA_BIDIRECTIONAL); -free_page: - __free_pages(page, order); +skip_obj: + i915_gem_object_put(obj); skip: if (size == I915_GTT_PAGE_SIZE_4K) return -ENOMEM; size = I915_GTT_PAGE_SIZE_4K; - gfp &= ~__GFP_NOWARN; } while (1); } -void cleanup_scratch_page(struct i915_address_space *vm) -{ - struct i915_page_dma *p = px_base(&vm->scratch[0]); - unsigned int order = vm->scratch_order; - - dma_unmap_page(vm->dma, p->daddr, BIT(order) << PAGE_SHIFT, - PCI_DMA_BIDIRECTIONAL); - __free_pages(p->page, order); -} - void free_scratch(struct i915_address_space *vm) { int i; - if (!px_dma(&vm->scratch[0])) /* set to 0 on clones */ - return; - - for (i = 1; i <= vm->top; i++) { - if (!px_dma(&vm->scratch[i])) - break; - cleanup_page_dma(vm, px_base(&vm->scratch[i])); - } - - cleanup_scratch_page(vm); + for (i = 0; i <= vm->top; i++) + i915_gem_object_put(vm->scratch[i]); } void gtt_write_workarounds(struct intel_gt *gt) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index f2b75078e05f..c13c650ced22 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -134,38 +134,29 @@ typedef u64 gen8_pte_t; #define GEN8_PDE_IPS_64K BIT(11) #define GEN8_PDE_PS_2M BIT(7) +enum i915_cache_level; + +struct drm_i915_file_private; +struct drm_i915_gem_object; struct i915_fence_reg; +struct i915_vma; +struct intel_gt; #define for_each_sgt_daddr(__dp, __iter, __sgt) \ __for_each_sgt_daddr(__dp, __iter, __sgt, I915_GTT_PAGE_SIZE) -struct i915_page_dma { - struct page *page; +struct i915_page_table { + struct drm_i915_gem_object *base; union { - dma_addr_t daddr; - - /* - * For gen6/gen7 only. This is the offset in the GGTT - * where the page directory entries for PPGTT begin - */ - u32 ggtt_offset; + atomic_t used; + struct i915_page_table *stash; }; }; -struct i915_page_scratch { - struct i915_page_dma base; - u64 encode; -}; - -struct i915_page_table { - struct i915_page_dma base; - atomic_t used; -}; - struct i915_page_directory { struct i915_page_table pt; spinlock_t lock; - void *entry[512]; + void **entry; }; #define __px_choose_expr(x, type, expr, other) \ @@ -176,12 +167,14 @@ struct i915_page_directory { other) #define px_base(px) \ - __px_choose_expr(px, struct i915_page_dma *, __x, \ - __px_choose_expr(px, struct i915_page_scratch *, &__x->base, \ - __px_choose_expr(px, struct i915_page_table *, &__x->base, \ - __px_choose_expr(px, struct i915_page_directory *, &__x->pt.base, \ - (void)0)))) -#define px_dma(px) (px_base(px)->daddr) + __px_choose_expr(px, struct drm_i915_gem_object *, __x, \ + __px_choose_expr(px, struct i915_page_table *, __x->base, \ + __px_choose_expr(px, struct i915_page_directory *, __x->pt.base, \ + (void)0))) + +struct page *__px_page(struct drm_i915_gem_object *p); +dma_addr_t __px_dma(struct drm_i915_gem_object *p); +#define px_dma(px) (__px_dma(px_base(px))) #define px_pt(px) \ __px_choose_expr(px, struct i915_page_table *, __x, \ @@ -189,19 +182,18 @@ struct i915_page_directory { (void)0)) #define px_used(px) (&px_pt(px)->used) -enum i915_cache_level; - -struct drm_i915_file_private; -struct drm_i915_gem_object; -struct i915_vma; -struct intel_gt; +struct i915_vm_pt_stash { + /* preallocated chains of page tables/directories */ + struct i915_page_table *pt[2]; +}; struct i915_vma_ops { /* Map an object into an address space with the given cache flags. */ - int (*bind_vma)(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags); + void (*bind_vma)(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags); /* * Unmap an object from an address space. This usually consists of * setting the valid PTE entries to a reserved scratch page. @@ -213,13 +205,6 @@ struct i915_vma_ops { void (*clear_pages)(struct i915_vma *vma); }; -struct pagestash { - spinlock_t lock; - struct pagevec pvec; -}; - -void stash_init(struct pagestash *stash); - struct i915_address_space { struct kref ref; struct rcu_work rcu; @@ -256,33 +241,33 @@ struct i915_address_space { #define VM_CLASS_GGTT 0 #define VM_CLASS_PPGTT 1 - struct i915_page_scratch scratch[4]; - unsigned int scratch_order; - unsigned int top; - + struct drm_i915_gem_object *scratch[4]; /** * List of vma currently bound. */ struct list_head bound_list; - struct pagestash free_pages; - /* Global GTT */ bool is_ggtt:1; - /* Some systems require uncached updates of the page directories */ - bool pt_kmap_wc:1; - /* Some systems support read-only mappings for GGTT and/or PPGTT */ bool has_read_only:1; + u8 top; + u8 pd_shift; + u8 scratch_order; + + struct drm_i915_gem_object * + (*alloc_pt_dma)(struct i915_address_space *vm, int sz); + u64 (*pte_encode)(dma_addr_t addr, enum i915_cache_level level, u32 flags); /* Create a valid PTE */ #define PTE_READ_ONLY BIT(0) - int (*allocate_va_range)(struct i915_address_space *vm, - u64 start, u64 length); + void (*allocate_va_range)(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 start, u64 length); void (*clear_range)(struct i915_address_space *vm, u64 start, u64 length); void (*insert_page)(struct i915_address_space *vm, @@ -490,9 +475,9 @@ i915_pd_entry(const struct i915_page_directory * const pdp, static inline dma_addr_t i915_page_dir_dma_addr(const struct i915_ppgtt *ppgtt, const unsigned int n) { - struct i915_page_dma *pt = ppgtt->pd->entry[n]; + struct i915_page_table *pt = ppgtt->pd->entry[n]; - return px_dma(pt ?: px_base(&ppgtt->vm.scratch[ppgtt->vm.top])); + return __px_dma(pt ? px_base(pt) : ppgtt->vm.scratch[ppgtt->vm.top]); } void ppgtt_init(struct i915_ppgtt *ppgtt, struct intel_gt *gt); @@ -517,13 +502,10 @@ struct i915_ppgtt *i915_ppgtt_create(struct intel_gt *gt); void i915_ggtt_suspend(struct i915_ggtt *gtt); void i915_ggtt_resume(struct i915_ggtt *ggtt); -int setup_page_dma(struct i915_address_space *vm, struct i915_page_dma *p); -void cleanup_page_dma(struct i915_address_space *vm, struct i915_page_dma *p); - -#define kmap_atomic_px(px) kmap_atomic(px_base(px)->page) +#define kmap_atomic_px(px) kmap_atomic(__px_page(px_base(px))) void -fill_page_dma(const struct i915_page_dma *p, const u64 val, unsigned int count); +fill_page_dma(struct drm_i915_gem_object *p, const u64 val, unsigned int count); #define fill_px(px, v) fill_page_dma(px_base(px), (v), PAGE_SIZE / sizeof(u64)) #define fill32_px(px, v) do { \ @@ -531,47 +513,51 @@ fill_page_dma(const struct i915_page_dma *p, const u64 val, unsigned int count); fill_px((px), v__ << 32 | v__); \ } while (0) -int setup_scratch_page(struct i915_address_space *vm, gfp_t gfp); -void cleanup_scratch_page(struct i915_address_space *vm); +int setup_scratch_page(struct i915_address_space *vm); void free_scratch(struct i915_address_space *vm); +struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz); struct i915_page_table *alloc_pt(struct i915_address_space *vm); struct i915_page_directory *alloc_pd(struct i915_address_space *vm); -struct i915_page_directory *__alloc_pd(size_t sz); +struct i915_page_directory *__alloc_pd(int npde); -void free_pd(struct i915_address_space *vm, struct i915_page_dma *pd); +int pin_pt_dma(struct i915_address_space *vm, struct drm_i915_gem_object *obj); -#define free_px(vm, px) free_pd(vm, px_base(px)) +void free_px(struct i915_address_space *vm, + struct i915_page_table *pt, int lvl); +#define free_pt(vm, px) free_px(vm, px, 0) +#define free_pd(vm, px) free_px(vm, px_pt(px), 1) void __set_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, - struct i915_page_dma * const to, + struct i915_page_table *pt, u64 (*encode)(const dma_addr_t, const enum i915_cache_level)); #define set_pd_entry(pd, idx, to) \ - __set_pd_entry((pd), (idx), px_base(to), gen8_pde_encode) + __set_pd_entry((pd), (idx), px_pt(to), gen8_pde_encode) void clear_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, - const struct i915_page_scratch * const scratch); + const struct drm_i915_gem_object * const scratch); bool release_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, struct i915_page_table * const pt, - const struct i915_page_scratch * const scratch); + const struct drm_i915_gem_object * const scratch); void gen6_ggtt_invalidate(struct i915_ggtt *ggtt); int ggtt_set_pages(struct i915_vma *vma); int ppgtt_set_pages(struct i915_vma *vma); void clear_pages(struct i915_vma *vma); -int ppgtt_bind_vma(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags); +void ppgtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags); void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma); @@ -579,6 +565,14 @@ void gtt_write_workarounds(struct intel_gt *gt); void setup_private_pat(struct intel_uncore *uncore); +int i915_vm_alloc_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 size); +int i915_vm_pin_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash); +void i915_vm_free_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash); + static inline struct sgt_dma { struct scatterlist *sg; dma_addr_t dma, max; diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 9eeaca957a7e..0412a44f25f2 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -137,6 +137,7 @@ #include "i915_perf.h" #include "i915_trace.h" #include "i915_vgpu.h" +#include "intel_breadcrumbs.h" #include "intel_context.h" #include "intel_engine_pm.h" #include "intel_gt.h" @@ -1148,20 +1149,6 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) } else { struct intel_engine_cs *owner = rq->context->engine; - /* - * Decouple the virtual breadcrumb before moving it - * back to the virtual engine -- we don't want the - * request to complete in the background and try - * and cancel the breadcrumb on the virtual engine - * (instead of the old engine where it is linked)! - */ - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, - &rq->fence.flags)) { - spin_lock_nested(&rq->lock, - SINGLE_DEPTH_NESTING); - i915_request_cancel_breadcrumb(rq); - spin_unlock(&rq->lock); - } WRITE_ONCE(rq->engine, owner); owner->submit_request(rq); active = NULL; @@ -1819,16 +1806,31 @@ static bool virtual_matches(const struct virtual_engine *ve, return true; } -static void virtual_xfer_breadcrumbs(struct virtual_engine *ve) +static void virtual_xfer_context(struct virtual_engine *ve, + struct intel_engine_cs *engine) { + unsigned int n; + + if (likely(engine == ve->siblings[0])) + return; + + GEM_BUG_ON(READ_ONCE(ve->context.inflight)); + if (!intel_engine_has_relative_mmio(engine)) + virtual_update_register_offsets(ve->context.lrc_reg_state, + engine); + /* - * All the outstanding signals on ve->siblings[0] must have - * been completed, just pending the interrupt handler. As those - * signals still refer to the old sibling (via rq->engine), we must - * transfer those to the old irq_worker to keep our locking - * consistent. + * Move the bound engine to the top of the list for + * future execution. We then kick this tasklet first + * before checking others, so that we preferentially + * reuse this set of bound registers. */ - intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context); + for (n = 1; n < ve->num_siblings; n++) { + if (ve->siblings[n] == engine) { + swap(ve->siblings[n], ve->siblings[0]); + break; + } + } } #define for_each_waiter(p__, rq__) \ @@ -2279,38 +2281,23 @@ static void execlists_dequeue(struct intel_engine_cs *engine) GEM_BUG_ON(!(rq->execution_mask & engine->mask)); WRITE_ONCE(rq->engine, engine); - if (engine != ve->siblings[0]) { - u32 *regs = ve->context.lrc_reg_state; - unsigned int n; - - GEM_BUG_ON(READ_ONCE(ve->context.inflight)); - - if (!intel_engine_has_relative_mmio(engine)) - virtual_update_register_offsets(regs, - engine); - - if (!list_empty(&ve->context.signals)) - virtual_xfer_breadcrumbs(ve); - + if (__i915_request_submit(rq)) { /* - * Move the bound engine to the top of the list - * for future execution. We then kick this - * tasklet first before checking others, so that - * we preferentially reuse this set of bound - * registers. + * Only after we confirm that we will submit + * this request (i.e. it has not already + * completed), do we want to update the context. + * + * This serves two purposes. It avoids + * unnecessary work if we are resubmitting an + * already completed request after timeslicing. + * But more importantly, it prevents us altering + * ve->siblings[] on an idle context, where + * we may be using ve->siblings[] in + * virtual_context_enter / virtual_context_exit. */ - for (n = 1; n < ve->num_siblings; n++) { - if (ve->siblings[n] == engine) { - swap(ve->siblings[n], - ve->siblings[0]); - break; - } - } - + virtual_xfer_context(ve, engine); GEM_BUG_ON(ve->siblings[0] != engine); - } - if (__i915_request_submit(rq)) { submit = true; last = rq; } @@ -3316,7 +3303,10 @@ static void execlists_context_unpin(struct intel_context *ce) { check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, ce->engine); +} +static void execlists_context_post_unpin(struct intel_context *ce) +{ i915_gem_object_unpin_map(ce->state->obj); } @@ -3478,20 +3468,24 @@ __execlists_update_reg_state(const struct intel_context *ce, } static int -__execlists_context_pin(struct intel_context *ce, - struct intel_engine_cs *engine) +execlists_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, void **vaddr) { - void *vaddr; - GEM_BUG_ON(!ce->state); GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); - vaddr = i915_gem_object_pin_map(ce->state->obj, - i915_coherent_map_type(engine->i915) | + *vaddr = i915_gem_object_pin_map(ce->state->obj, + i915_coherent_map_type(ce->engine->i915) | I915_MAP_OVERRIDE); - if (IS_ERR(vaddr)) - return PTR_ERR(vaddr); + return PTR_ERR_OR_ZERO(*vaddr); +} + +static int +__execlists_context_pin(struct intel_context *ce, + struct intel_engine_cs *engine, + void *vaddr) +{ ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; __execlists_update_reg_state(ce, engine, ce->ring->tail); @@ -3499,9 +3493,9 @@ __execlists_context_pin(struct intel_context *ce, return 0; } -static int execlists_context_pin(struct intel_context *ce) +static int execlists_context_pin(struct intel_context *ce, void *vaddr) { - return __execlists_context_pin(ce, ce->engine); + return __execlists_context_pin(ce, ce->engine, vaddr); } static int execlists_context_alloc(struct intel_context *ce) @@ -3527,8 +3521,10 @@ static void execlists_context_reset(struct intel_context *ce) static const struct intel_context_ops execlists_context_ops = { .alloc = execlists_context_alloc, + .pre_pin = execlists_context_pre_pin, .pin = execlists_context_pin, .unpin = execlists_context_unpin, + .post_unpin = execlists_context_post_unpin, .enter = intel_context_enter_engine, .exit = intel_context_exit_engine, @@ -3892,7 +3888,7 @@ static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) goto err; } - err = i915_ggtt_pin(vma, 0, PIN_HIGH); + err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); if (err) goto err; @@ -4133,7 +4129,7 @@ static int execlists_resume(struct intel_engine_cs *engine) { intel_mocs_init_engine(engine); - intel_engine_reset_breadcrumbs(engine); + intel_breadcrumbs_reset(engine->breadcrumbs); if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { struct drm_printer p = drm_debug_printer(__func__); @@ -4562,7 +4558,7 @@ static int gen8_emit_flush_render(struct i915_request *request, vf_flush_wa = true; /* WaForGAMHang:kbl */ - if (IS_KBL_REVID(request->engine->i915, 0, KBL_REVID_B0)) + if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0)) dc_flush_wa = true; } @@ -4764,14 +4760,21 @@ static int gen12_emit_flush(struct i915_request *request, u32 mode) intel_engine_mask_t aux_inv = 0; u32 cmd, *cs; + cmd = 4; + if (mode & EMIT_INVALIDATE) + cmd += 2; if (mode & EMIT_INVALIDATE) aux_inv = request->engine->mask & ~BIT(BCS0); + if (aux_inv) + cmd += 2 * hweight8(aux_inv) + 2; - cs = intel_ring_begin(request, - 4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0)); + cs = intel_ring_begin(request, cmd); if (IS_ERR(cs)) return PTR_ERR(cs); + if (mode & EMIT_INVALIDATE) + *cs++ = preparser_disable(true); + cmd = MI_FLUSH_DW + 1; /* We always require a command barrier so that subsequent @@ -4804,6 +4807,10 @@ static int gen12_emit_flush(struct i915_request *request, u32 mode) } *cs++ = MI_NOOP; } + + if (mode & EMIT_INVALIDATE) + *cs++ = preparser_disable(false); + intel_ring_advance(request, cs); return 0; @@ -5302,6 +5309,14 @@ populate_lr_context(struct intel_context *ce, return 0; } +static struct intel_timeline *pinned_timeline(struct intel_context *ce) +{ + struct intel_timeline *tl = fetch_and_zero(&ce->timeline); + + return intel_timeline_create_from_engine(ce->engine, + page_unmask_bits(tl)); +} + static int __execlists_context_alloc(struct intel_context *ce, struct intel_engine_cs *engine) { @@ -5332,19 +5347,17 @@ static int __execlists_context_alloc(struct intel_context *ce, goto error_deref_obj; } - if (!ce->timeline) { + if (!page_mask_bits(ce->timeline)) { struct intel_timeline *tl; - struct i915_vma *hwsp; /* * Use the static global HWSP for the kernel context, and * a dynamically allocated cacheline for everyone else. */ - hwsp = NULL; - if (unlikely(intel_context_is_barrier(ce))) - hwsp = engine->status_page.vma; - - tl = intel_timeline_create(engine->gt, hwsp); + if (unlikely(ce->timeline)) + tl = pinned_timeline(ce); + else + tl = intel_timeline_create(engine->gt); if (IS_ERR(tl)) { ret = PTR_ERR(tl); goto error_deref_obj; @@ -5450,12 +5463,12 @@ static int virtual_context_alloc(struct intel_context *ce) return __execlists_context_alloc(ce, ve->siblings[0]); } -static int virtual_context_pin(struct intel_context *ce) +static int virtual_context_pin(struct intel_context *ce, void *vaddr) { struct virtual_engine *ve = container_of(ce, typeof(*ve), context); /* Note: we must use a real engine class for setting up reg state */ - return __execlists_context_pin(ce, ve->siblings[0]); + return __execlists_context_pin(ce, ve->siblings[0], vaddr); } static void virtual_context_enter(struct intel_context *ce) @@ -5483,8 +5496,10 @@ static void virtual_context_exit(struct intel_context *ce) static const struct intel_context_ops virtual_context_ops = { .alloc = virtual_context_alloc, + .pre_pin = execlists_context_pre_pin, .pin = virtual_context_pin, .unpin = execlists_context_unpin, + .post_unpin = execlists_context_post_unpin, .enter = virtual_context_enter, .exit = virtual_context_exit, @@ -5718,9 +5733,7 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings, snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); - intel_engine_init_breadcrumbs(&ve->base); intel_engine_init_execlists(&ve->base); - ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */ ve->base.cops = &virtual_context_ops; ve->base.request_alloc = execlists_request_alloc; @@ -5737,6 +5750,12 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings, intel_context_init(&ve->context, &ve->base); + ve->base.breadcrumbs = intel_breadcrumbs_create(NULL); + if (!ve->base.breadcrumbs) { + err = -ENOMEM; + goto err_put; + } + for (n = 0; n < count; n++) { struct intel_engine_cs *sibling = siblings[n]; diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c index f0862e924d11..46d9aceda64c 100644 --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c @@ -18,7 +18,8 @@ struct i915_page_table *alloc_pt(struct i915_address_space *vm) if (unlikely(!pt)) return ERR_PTR(-ENOMEM); - if (unlikely(setup_page_dma(vm, &pt->base))) { + pt->base = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(pt->base)) { kfree(pt); return ERR_PTR(-ENOMEM); } @@ -27,14 +28,20 @@ struct i915_page_table *alloc_pt(struct i915_address_space *vm) return pt; } -struct i915_page_directory *__alloc_pd(size_t sz) +struct i915_page_directory *__alloc_pd(int count) { struct i915_page_directory *pd; - pd = kzalloc(sz, I915_GFP_ALLOW_FAIL); + pd = kzalloc(sizeof(*pd), I915_GFP_ALLOW_FAIL); if (unlikely(!pd)) return NULL; + pd->entry = kcalloc(count, sizeof(*pd->entry), I915_GFP_ALLOW_FAIL); + if (unlikely(!pd->entry)) { + kfree(pd); + return NULL; + } + spin_lock_init(&pd->lock); return pd; } @@ -43,11 +50,13 @@ struct i915_page_directory *alloc_pd(struct i915_address_space *vm) { struct i915_page_directory *pd; - pd = __alloc_pd(sizeof(*pd)); + pd = __alloc_pd(I915_PDES); if (unlikely(!pd)) return ERR_PTR(-ENOMEM); - if (unlikely(setup_page_dma(vm, px_base(pd)))) { + pd->pt.base = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(pd->pt.base)) { + kfree(pd->entry); kfree(pd); return ERR_PTR(-ENOMEM); } @@ -55,41 +64,52 @@ struct i915_page_directory *alloc_pd(struct i915_address_space *vm) return pd; } -void free_pd(struct i915_address_space *vm, struct i915_page_dma *pd) +void free_px(struct i915_address_space *vm, struct i915_page_table *pt, int lvl) { - cleanup_page_dma(vm, pd); - kfree(pd); + BUILD_BUG_ON(offsetof(struct i915_page_directory, pt)); + + if (lvl) { + struct i915_page_directory *pd = + container_of(pt, typeof(*pd), pt); + kfree(pd->entry); + } + + if (pt->base) + i915_gem_object_put(pt->base); + + kfree(pt); } static inline void -write_dma_entry(struct i915_page_dma * const pdma, +write_dma_entry(struct drm_i915_gem_object * const pdma, const unsigned short idx, const u64 encoded_entry) { - u64 * const vaddr = kmap_atomic(pdma->page); + u64 * const vaddr = kmap_atomic(__px_page(pdma)); vaddr[idx] = encoded_entry; + clflush_cache_range(&vaddr[idx], sizeof(u64)); kunmap_atomic(vaddr); } void __set_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, - struct i915_page_dma * const to, + struct i915_page_table * const to, u64 (*encode)(const dma_addr_t, const enum i915_cache_level)) { /* Each thread pre-pins the pd, and we may have a thread per pde. */ - GEM_BUG_ON(atomic_read(px_used(pd)) > NALLOC * ARRAY_SIZE(pd->entry)); + GEM_BUG_ON(atomic_read(px_used(pd)) > NALLOC * I915_PDES); atomic_inc(px_used(pd)); pd->entry[idx] = to; - write_dma_entry(px_base(pd), idx, encode(to->daddr, I915_CACHE_LLC)); + write_dma_entry(px_base(pd), idx, encode(px_dma(to), I915_CACHE_LLC)); } void clear_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, - const struct i915_page_scratch * const scratch) + const struct drm_i915_gem_object * const scratch) { GEM_BUG_ON(atomic_read(px_used(pd)) == 0); @@ -102,7 +122,7 @@ bool release_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, struct i915_page_table * const pt, - const struct i915_page_scratch * const scratch) + const struct drm_i915_gem_object * const scratch) { bool free = false; @@ -155,19 +175,16 @@ struct i915_ppgtt *i915_ppgtt_create(struct intel_gt *gt) return ppgtt; } -int ppgtt_bind_vma(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags) +void ppgtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) { u32 pte_flags; - int err; if (!test_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma))) { - err = vm->allocate_va_range(vm, vma->node.start, vma->size); - if (err) - return err; - + vm->allocate_va_range(vm, stash, vma->node.start, vma->size); set_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma)); } @@ -178,8 +195,6 @@ int ppgtt_bind_vma(struct i915_address_space *vm, vm->insert_entries(vm, vma, cache_level, pte_flags); wmb(); - - return 0; } void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma) @@ -188,12 +203,93 @@ void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma) vm->clear_range(vm, vma->node.start, vma->size); } +static unsigned long pd_count(u64 size, int shift) +{ + /* Beware later misalignment */ + return (size + 2 * (BIT_ULL(shift) - 1)) >> shift; +} + +int i915_vm_alloc_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 size) +{ + unsigned long count; + int shift, n; + + shift = vm->pd_shift; + if (!shift) + return 0; + + count = pd_count(size, shift); + while (count--) { + struct i915_page_table *pt; + + pt = alloc_pt(vm); + if (IS_ERR(pt)) { + i915_vm_free_pt_stash(vm, stash); + return PTR_ERR(pt); + } + + pt->stash = stash->pt[0]; + stash->pt[0] = pt; + } + + for (n = 1; n < vm->top; n++) { + shift += ilog2(I915_PDES); /* Each PD holds 512 entries */ + count = pd_count(size, shift); + while (count--) { + struct i915_page_directory *pd; + + pd = alloc_pd(vm); + if (IS_ERR(pd)) { + i915_vm_free_pt_stash(vm, stash); + return PTR_ERR(pd); + } + + pd->pt.stash = stash->pt[1]; + stash->pt[1] = &pd->pt; + } + } + + return 0; +} + +int i915_vm_pin_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash) +{ + struct i915_page_table *pt; + int n, err; + + for (n = 0; n < ARRAY_SIZE(stash->pt); n++) { + for (pt = stash->pt[n]; pt; pt = pt->stash) { + err = pin_pt_dma(vm, pt->base); + if (err) + return err; + } + } + + return 0; +} + +void i915_vm_free_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash) +{ + struct i915_page_table *pt; + int n; + + for (n = 0; n < ARRAY_SIZE(stash->pt); n++) { + while ((pt = stash->pt[n])) { + stash->pt[n] = pt->stash; + free_px(vm, pt, n); + } + } +} + int ppgtt_set_pages(struct i915_vma *vma) { GEM_BUG_ON(vma->pages); vma->pages = vma->obj->mm.pages; - vma->page_sizes = vma->obj->mm.page_sizes; return 0; diff --git a/drivers/gpu/drm/i915/gt/intel_renderstate.c b/drivers/gpu/drm/i915/gt/intel_renderstate.c index 1bfad589c63b..ea2a77c7b469 100644 --- a/drivers/gpu/drm/i915/gt/intel_renderstate.c +++ b/drivers/gpu/drm/i915/gt/intel_renderstate.c @@ -27,6 +27,7 @@ #include "i915_drv.h" #include "intel_renderstate.h" +#include "gt/intel_context.h" #include "intel_ring.h" static const struct intel_renderstate_rodata * @@ -157,33 +158,47 @@ out: #undef OUT_BATCH int intel_renderstate_init(struct intel_renderstate *so, - struct intel_engine_cs *engine) + struct intel_context *ce) { - struct drm_i915_gem_object *obj; + struct intel_engine_cs *engine = ce->engine; + struct drm_i915_gem_object *obj = NULL; int err; memset(so, 0, sizeof(*so)); so->rodata = render_state_get_rodata(engine); - if (!so->rodata) - return 0; + if (so->rodata) { + if (so->rodata->batch_items * 4 > PAGE_SIZE) + return -EINVAL; + + obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + so->vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); + if (IS_ERR(so->vma)) { + err = PTR_ERR(so->vma); + goto err_obj; + } + } - if (so->rodata->batch_items * 4 > PAGE_SIZE) - return -EINVAL; + i915_gem_ww_ctx_init(&so->ww, true); +retry: + err = intel_context_pin_ww(ce, &so->ww); + if (err) + goto err_fini; - obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); - if (IS_ERR(obj)) - return PTR_ERR(obj); + /* return early if there's nothing to setup */ + if (!err && !so->rodata) + return 0; - so->vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); - if (IS_ERR(so->vma)) { - err = PTR_ERR(so->vma); - goto err_obj; - } + err = i915_gem_object_lock(so->vma->obj, &so->ww); + if (err) + goto err_context; err = i915_vma_pin(so->vma, 0, 0, PIN_GLOBAL | PIN_HIGH); if (err) - goto err_obj; + goto err_context; err = render_state_setup(so, engine->i915); if (err) @@ -193,8 +208,18 @@ int intel_renderstate_init(struct intel_renderstate *so, err_unpin: i915_vma_unpin(so->vma); +err_context: + intel_context_unpin(ce); +err_fini: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&so->ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&so->ww); err_obj: - i915_gem_object_put(obj); + if (obj) + i915_gem_object_put(obj); so->vma = NULL; return err; } @@ -208,11 +233,9 @@ int intel_renderstate_emit(struct intel_renderstate *so, if (!so->vma) return 0; - i915_vma_lock(so->vma); err = i915_request_await_object(rq, so->vma->obj, false); if (err == 0) err = i915_vma_move_to_active(so->vma, rq, 0); - i915_vma_unlock(so->vma); if (err) return err; @@ -233,7 +256,17 @@ int intel_renderstate_emit(struct intel_renderstate *so, return 0; } -void intel_renderstate_fini(struct intel_renderstate *so) +void intel_renderstate_fini(struct intel_renderstate *so, + struct intel_context *ce) { - i915_vma_unpin_and_release(&so->vma, 0); + if (so->vma) { + i915_vma_unpin(so->vma); + i915_vma_close(so->vma); + } + + intel_context_unpin(ce); + i915_gem_ww_ctx_fini(&so->ww); + + if (so->vma) + i915_gem_object_put(so->vma->obj); } diff --git a/drivers/gpu/drm/i915/gt/intel_renderstate.h b/drivers/gpu/drm/i915/gt/intel_renderstate.h index 5700be69a05a..713aa1e86c80 100644 --- a/drivers/gpu/drm/i915/gt/intel_renderstate.h +++ b/drivers/gpu/drm/i915/gt/intel_renderstate.h @@ -25,9 +25,10 @@ #define _INTEL_RENDERSTATE_H_ #include <linux/types.h> +#include "i915_gem.h" struct i915_request; -struct intel_engine_cs; +struct intel_context; struct i915_vma; struct intel_renderstate_rodata { @@ -49,6 +50,7 @@ extern const struct intel_renderstate_rodata gen8_null_state; extern const struct intel_renderstate_rodata gen9_null_state; struct intel_renderstate { + struct i915_gem_ww_ctx ww; const struct intel_renderstate_rodata *rodata; struct i915_vma *vma; u32 batch_offset; @@ -58,9 +60,10 @@ struct intel_renderstate { }; int intel_renderstate_init(struct intel_renderstate *so, - struct intel_engine_cs *engine); + struct intel_context *ce); int intel_renderstate_emit(struct intel_renderstate *so, struct i915_request *rq); -void intel_renderstate_fini(struct intel_renderstate *so); +void intel_renderstate_fini(struct intel_renderstate *so, + struct intel_context *ce); #endif /* _INTEL_RENDERSTATE_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 46a5ceffc22f..ac36b67fb46b 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -15,6 +15,7 @@ #include "i915_drv.h" #include "i915_gpu_error.h" #include "i915_irq.h" +#include "intel_breadcrumbs.h" #include "intel_engine_pm.h" #include "intel_gt.h" #include "intel_gt_pm.h" diff --git a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c index bdb324167ef3..4034a4bac7f0 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring.c +++ b/drivers/gpu/drm/i915/gt/intel_ring.c @@ -21,7 +21,13 @@ unsigned int intel_ring_update_space(struct intel_ring *ring) return space; } -int intel_ring_pin(struct intel_ring *ring) +void __intel_ring_pin(struct intel_ring *ring) +{ + GEM_BUG_ON(!atomic_read(&ring->pin_count)); + atomic_inc(&ring->pin_count); +} + +int intel_ring_pin(struct intel_ring *ring, struct i915_gem_ww_ctx *ww) { struct i915_vma *vma = ring->vma; unsigned int flags; @@ -39,7 +45,7 @@ int intel_ring_pin(struct intel_ring *ring) else flags |= PIN_HIGH; - ret = i915_ggtt_pin(vma, 0, flags); + ret = i915_ggtt_pin(vma, ww, 0, flags); if (unlikely(ret)) goto err_unpin; diff --git a/drivers/gpu/drm/i915/gt/intel_ring.h b/drivers/gpu/drm/i915/gt/intel_ring.h index cc0ebca65167..1700579bdc93 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring.h +++ b/drivers/gpu/drm/i915/gt/intel_ring.h @@ -21,7 +21,8 @@ int intel_ring_cacheline_align(struct i915_request *rq); unsigned int intel_ring_update_space(struct intel_ring *ring); -int intel_ring_pin(struct intel_ring *ring); +void __intel_ring_pin(struct intel_ring *ring); +int intel_ring_pin(struct intel_ring *ring, struct i915_gem_ww_ctx *ww); void intel_ring_unpin(struct intel_ring *ring); void intel_ring_reset(struct intel_ring *ring, u32 tail); diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index 898593ca4889..16b48e72c369 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -32,6 +32,7 @@ #include "gen6_ppgtt.h" #include "gen7_renderclear.h" #include "i915_drv.h" +#include "intel_breadcrumbs.h" #include "intel_context.h" #include "intel_gt.h" #include "intel_reset.h" @@ -201,16 +202,18 @@ static struct i915_address_space *vm_alias(struct i915_address_space *vm) return vm; } +static u32 pp_dir(struct i915_address_space *vm) +{ + return to_gen6_ppgtt(i915_vm_to_ppgtt(vm))->pp_dir; +} + static void set_pp_dir(struct intel_engine_cs *engine) { struct i915_address_space *vm = vm_alias(engine->gt->vm); if (vm) { - struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm); - ENGINE_WRITE(engine, RING_PP_DIR_DCLV, PP_DIR_DCLV_2G); - ENGINE_WRITE(engine, RING_PP_DIR_BASE, - px_base(ppgtt->pd)->ggtt_offset << 10); + ENGINE_WRITE(engine, RING_PP_DIR_BASE, pp_dir(vm)); } } @@ -255,7 +258,7 @@ static int xcs_resume(struct intel_engine_cs *engine) else ring_setup_status_page(engine); - intel_engine_reset_breadcrumbs(engine); + intel_breadcrumbs_reset(engine->breadcrumbs); /* Enforce ordering by reading HEAD register back */ ENGINE_POSTING_READ(engine, RING_HEAD); @@ -474,14 +477,16 @@ static void ring_context_destroy(struct kref *ref) intel_context_free(ce); } -static int __context_pin_ppgtt(struct intel_context *ce) +static int ring_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, + void **unused) { struct i915_address_space *vm; int err = 0; vm = vm_alias(ce->vm); if (vm) - err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm))); + err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm)), ww); return err; } @@ -497,6 +502,10 @@ static void __context_unpin_ppgtt(struct intel_context *ce) static void ring_context_unpin(struct intel_context *ce) { +} + +static void ring_context_post_unpin(struct intel_context *ce) +{ __context_unpin_ppgtt(ce); } @@ -584,9 +593,9 @@ static int ring_context_alloc(struct intel_context *ce) return 0; } -static int ring_context_pin(struct intel_context *ce) +static int ring_context_pin(struct intel_context *ce, void *unused) { - return __context_pin_ppgtt(ce); + return 0; } static void ring_context_reset(struct intel_context *ce) @@ -597,8 +606,10 @@ static void ring_context_reset(struct intel_context *ce) static const struct intel_context_ops ring_context_ops = { .alloc = ring_context_alloc, + .pre_pin = ring_context_pre_pin, .pin = ring_context_pin, .unpin = ring_context_unpin, + .post_unpin = ring_context_post_unpin, .enter = intel_context_enter_engine, .exit = intel_context_exit_engine, @@ -608,7 +619,7 @@ static const struct intel_context_ops ring_context_ops = { }; static int load_pd_dir(struct i915_request *rq, - const struct i915_ppgtt *ppgtt, + struct i915_address_space *vm, u32 valid) { const struct intel_engine_cs * const engine = rq->engine; @@ -624,7 +635,7 @@ static int load_pd_dir(struct i915_request *rq, *cs++ = MI_LOAD_REGISTER_IMM(1); *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base)); - *cs++ = px_base(ppgtt->pd)->ggtt_offset << 10; + *cs++ = pp_dir(vm); /* Stall until the page table load is complete? */ *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; @@ -826,7 +837,7 @@ static int switch_mm(struct i915_request *rq, struct i915_address_space *vm) * post-sync op, this extra pass appears vital before a * mm switch! */ - ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm), PP_DIR_DCLV_2G); + ret = load_pd_dir(rq, vm, PP_DIR_DCLV_2G); if (ret) return ret; @@ -1250,14 +1261,15 @@ int intel_ring_submission_setup(struct intel_engine_cs *engine) return -ENODEV; } - timeline = intel_timeline_create(engine->gt, engine->status_page.vma); + timeline = intel_timeline_create_from_engine(engine, + I915_GEM_HWS_SEQNO_ADDR); if (IS_ERR(timeline)) { err = PTR_ERR(timeline); goto err; } GEM_BUG_ON(timeline->has_initial_breadcrumb); - err = intel_timeline_pin(timeline); + err = intel_timeline_pin(timeline, NULL); if (err) goto err_timeline; @@ -1267,7 +1279,7 @@ int intel_ring_submission_setup(struct intel_engine_cs *engine) goto err_timeline_unpin; } - err = intel_ring_pin(ring); + err = intel_ring_pin(ring, NULL); if (err) goto err_ring; diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index 97ba14ad52e4..e6a00eea0631 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -7,6 +7,7 @@ #include <drm/i915_drm.h> #include "i915_drv.h" +#include "intel_breadcrumbs.h" #include "intel_gt.h" #include "intel_gt_clock_utils.h" #include "intel_gt_irq.h" diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c index 46d20f5f3ddc..a2f74cefe4c3 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.c +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c @@ -215,7 +215,8 @@ static void cacheline_free(struct intel_timeline_cacheline *cl) static int intel_timeline_init(struct intel_timeline *timeline, struct intel_gt *gt, - struct i915_vma *hwsp) + struct i915_vma *hwsp, + unsigned int offset) { void *vaddr; @@ -246,8 +247,7 @@ static int intel_timeline_init(struct intel_timeline *timeline, vaddr = page_mask_bits(cl->vaddr); } else { - timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR; - + timeline->hwsp_offset = offset; vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB); if (IS_ERR(vaddr)) return PTR_ERR(vaddr); @@ -297,7 +297,9 @@ static void intel_timeline_fini(struct intel_timeline *timeline) } struct intel_timeline * -intel_timeline_create(struct intel_gt *gt, struct i915_vma *global_hwsp) +__intel_timeline_create(struct intel_gt *gt, + struct i915_vma *global_hwsp, + unsigned int offset) { struct intel_timeline *timeline; int err; @@ -306,7 +308,7 @@ intel_timeline_create(struct intel_gt *gt, struct i915_vma *global_hwsp) if (!timeline) return ERR_PTR(-ENOMEM); - err = intel_timeline_init(timeline, gt, global_hwsp); + err = intel_timeline_init(timeline, gt, global_hwsp, offset); if (err) { kfree(timeline); return ERR_PTR(err); @@ -315,14 +317,20 @@ intel_timeline_create(struct intel_gt *gt, struct i915_vma *global_hwsp) return timeline; } -int intel_timeline_pin(struct intel_timeline *tl) +void __intel_timeline_pin(struct intel_timeline *tl) +{ + GEM_BUG_ON(!atomic_read(&tl->pin_count)); + atomic_inc(&tl->pin_count); +} + +int intel_timeline_pin(struct intel_timeline *tl, struct i915_gem_ww_ctx *ww) { int err; if (atomic_add_unless(&tl->pin_count, 1, 0)) return 0; - err = i915_ggtt_pin(tl->hwsp_ggtt, 0, PIN_HIGH); + err = i915_ggtt_pin(tl->hwsp_ggtt, ww, 0, PIN_HIGH); if (err) return err; @@ -465,7 +473,7 @@ __intel_timeline_get_seqno(struct intel_timeline *tl, goto err_rollback; } - err = i915_ggtt_pin(vma, 0, PIN_HIGH); + err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); if (err) { __idle_hwsp_free(vma->private, cacheline); goto err_rollback; @@ -484,7 +492,9 @@ __intel_timeline_get_seqno(struct intel_timeline *tl, * free it after the current request is retired, which ensures that * all writes into the cacheline from previous requests are complete. */ - err = i915_active_ref(&tl->hwsp_cacheline->active, tl, &rq->fence); + err = i915_active_ref(&tl->hwsp_cacheline->active, + tl->fence_context, + &rq->fence); if (err) goto err_cacheline; diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.h b/drivers/gpu/drm/i915/gt/intel_timeline.h index 4298b9ac7327..9882cd911d8e 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.h +++ b/drivers/gpu/drm/i915/gt/intel_timeline.h @@ -29,10 +29,27 @@ #include "i915_active.h" #include "i915_syncmap.h" -#include "gt/intel_timeline_types.h" +#include "intel_timeline_types.h" struct intel_timeline * -intel_timeline_create(struct intel_gt *gt, struct i915_vma *global_hwsp); +__intel_timeline_create(struct intel_gt *gt, + struct i915_vma *global_hwsp, + unsigned int offset); + +static inline struct intel_timeline * +intel_timeline_create(struct intel_gt *gt) +{ + return __intel_timeline_create(gt, NULL, 0); +} + +static inline struct intel_timeline * +intel_timeline_create_from_engine(struct intel_engine_cs *engine, + unsigned int offset) +{ + return __intel_timeline_create(engine->gt, + engine->status_page.vma, + offset); +} static inline struct intel_timeline * intel_timeline_get(struct intel_timeline *timeline) @@ -71,7 +88,8 @@ static inline bool intel_timeline_sync_is_later(struct intel_timeline *tl, return __intel_timeline_sync_is_later(tl, fence->context, fence->seqno); } -int intel_timeline_pin(struct intel_timeline *tl); +void __intel_timeline_pin(struct intel_timeline *tl); +int intel_timeline_pin(struct intel_timeline *tl, struct i915_gem_ww_ctx *ww); void intel_timeline_enter(struct intel_timeline *tl); int intel_timeline_get_seqno(struct intel_timeline *tl, struct i915_request *rq, diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 5726cd0a37e0..6c580d0d9ea8 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -52,6 +52,37 @@ * - Public functions to init or apply the given workaround type. */ +/* + * KBL revision ID ordering is bizarre; higher revision ID's map to lower + * steppings in some cases. So rather than test against the revision ID + * directly, let's map that into our own range of increasing ID's that we + * can test against in a regular manner. + */ + +const struct i915_rev_steppings kbl_revids[] = { + [0] = { .gt_stepping = KBL_REVID_A0, .disp_stepping = KBL_REVID_A0 }, + [1] = { .gt_stepping = KBL_REVID_B0, .disp_stepping = KBL_REVID_B0 }, + [2] = { .gt_stepping = KBL_REVID_C0, .disp_stepping = KBL_REVID_B0 }, + [3] = { .gt_stepping = KBL_REVID_D0, .disp_stepping = KBL_REVID_B0 }, + [4] = { .gt_stepping = KBL_REVID_F0, .disp_stepping = KBL_REVID_C0 }, + [5] = { .gt_stepping = KBL_REVID_C0, .disp_stepping = KBL_REVID_B1 }, + [6] = { .gt_stepping = KBL_REVID_D1, .disp_stepping = KBL_REVID_B1 }, + [7] = { .gt_stepping = KBL_REVID_G0, .disp_stepping = KBL_REVID_C0 }, +}; + +const struct i915_rev_steppings tgl_uy_revids[] = { + [0] = { .gt_stepping = TGL_REVID_A0, .disp_stepping = TGL_REVID_A0 }, + [1] = { .gt_stepping = TGL_REVID_B0, .disp_stepping = TGL_REVID_C0 }, + [2] = { .gt_stepping = TGL_REVID_B1, .disp_stepping = TGL_REVID_C0 }, + [3] = { .gt_stepping = TGL_REVID_C0, .disp_stepping = TGL_REVID_D0 }, +}; + +/* Same GT stepping between tgl_uy_revids and tgl_revids don't mean the same HW */ +const struct i915_rev_steppings tgl_revids[] = { + [0] = { .gt_stepping = TGL_REVID_A0, .disp_stepping = TGL_REVID_B0 }, + [1] = { .gt_stepping = TGL_REVID_B0, .disp_stepping = TGL_REVID_D0 }, +}; + static void wa_init_start(struct i915_wa_list *wal, const char *name, const char *engine_name) { wal->name = name; @@ -470,7 +501,7 @@ static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine, gen9_ctx_workarounds_init(engine, wal); /* WaToEnableHwFixForPushConstHWBug:kbl */ - if (IS_KBL_REVID(i915, KBL_REVID_C0, REVID_FOREVER)) + if (IS_KBL_GT_REVID(i915, KBL_REVID_C0, REVID_FOREVER)) WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); @@ -596,8 +627,8 @@ static void icl_ctx_workarounds_init(struct intel_engine_cs *engine, wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU); } -static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine, - struct i915_wa_list *wal) +static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) { /* * Wa_1409142259:tgl @@ -607,12 +638,28 @@ static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine, * Wa_1409207793:tgl * Wa_1409178076:tgl * Wa_1408979724:tgl + * Wa_14010443199:rkl + * Wa_14010698770:rkl */ WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3, GEN12_DISABLE_CPS_AWARE_COLOR_PIPE); + /* WaDisableGPGPUMidThreadPreemption:gen12 */ + WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + GEN9_PREEMPT_GPGPU_LEVEL_MASK, + GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL); +} + +static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen12_ctx_workarounds_init(engine, wal); + /* - * Wa_1604555607:gen12 and Wa_1608008084:gen12 + * Wa_1604555607:tgl,rkl + * + * Note that the implementation of this workaround is further modified + * according to the FF_MODE2 guidance given by Wa_1608008084:gen12. * FF_MODE2 register will return the wrong value when read. The default * value for this register is zero for all fields and there are no bit * masks. So instead of doing a RMW we should just write the GS Timer @@ -623,11 +670,6 @@ static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine, FF_MODE2_GS_TIMER_MASK | FF_MODE2_TDS_TIMER_MASK, FF_MODE2_GS_TIMER_224 | FF_MODE2_TDS_TIMER_128, 0); - - /* WaDisableGPGPUMidThreadPreemption:tgl */ - WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, - GEN9_PREEMPT_GPGPU_LEVEL_MASK, - GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL); } static void @@ -642,8 +684,10 @@ __intel_engine_init_ctx_wa(struct intel_engine_cs *engine, wa_init_start(wal, name, engine->name); - if (IS_GEN(i915, 12)) + if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) tgl_ctx_workarounds_init(engine, wal); + else if (IS_GEN(i915, 12)) + gen12_ctx_workarounds_init(engine, wal); else if (IS_GEN(i915, 11)) icl_ctx_workarounds_init(engine, wal); else if (IS_CANNONLAKE(i915)) @@ -995,7 +1039,7 @@ kbl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) gen9_gt_workarounds_init(i915, wal); /* WaDisableDynamicCreditSharing:kbl */ - if (IS_KBL_REVID(i915, 0, KBL_REVID_B0)) + if (IS_KBL_GT_REVID(i915, 0, KBL_REVID_B0)) wa_write_or(wal, GAMT_CHKN_BIT_REG, GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING); @@ -1176,18 +1220,25 @@ icl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) } static void -tgl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +gen12_gt_workarounds_init(struct drm_i915_private *i915, + struct i915_wa_list *wal) { wa_init_mcr(i915, wal); +} + +static void +tgl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen12_gt_workarounds_init(i915, wal); /* Wa_1409420604:tgl */ - if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) + if (IS_TGL_UY_GT_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) wa_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2, CPSSUNIT_CLKGATE_DIS); /* Wa_1607087056:tgl also know as BUG:1409180338 */ - if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) + if (IS_TGL_UY_GT_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) wa_write_or(wal, SLICE_UNIT_LEVEL_CLKGATE, L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS); @@ -1196,8 +1247,10 @@ tgl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) static void gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal) { - if (IS_GEN(i915, 12)) + if (IS_TIGERLAKE(i915)) tgl_gt_workarounds_init(i915, wal); + else if (IS_GEN(i915, 12)) + gen12_gt_workarounds_init(i915, wal); else if (IS_GEN(i915, 11)) icl_gt_workarounds_init(i915, wal); else if (IS_CANNONLAKE(i915)) @@ -1620,7 +1673,7 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { struct drm_i915_private *i915 = engine->i915; - if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) { + if (IS_TGL_UY_GT_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) { /* * Wa_1607138336:tgl * Wa_1607063988:tgl @@ -1630,18 +1683,6 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) GEN12_DISABLE_POSH_BUSY_FF_DOP_CG); /* - * Wa_1607030317:tgl - * Wa_1607186500:tgl - * Wa_1607297627:tgl there is 3 entries for this WA on BSpec, 2 - * of then says it is fixed on B0 the other one says it is - * permanent - */ - wa_masked_en(wal, - GEN6_RC_SLEEP_PSMI_CONTROL, - GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE | - GEN8_RC_SEMA_IDLE_MSG_DISABLE); - - /* * Wa_1606679103:tgl * (see also Wa_1606682166:icl) */ @@ -1654,22 +1695,17 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) VSUNIT_CLKGATE_DIS_TGL); } - if (IS_TIGERLAKE(i915)) { - /* Wa_1606931601:tgl */ + if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) { + /* Wa_1606931601:tgl,rkl */ wa_masked_en(wal, GEN7_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ); - /* Wa_1409804808:tgl */ + /* Wa_1409804808:tgl,rkl */ wa_masked_en(wal, GEN7_ROW_CHICKEN2, GEN12_PUSH_CONST_DEREF_HOLD_DIS); - /* Wa_1606700617:tgl */ - wa_masked_en(wal, - GEN9_CS_DEBUG_MODE1, - FF_DOP_CLOCK_GATE_DISABLE); - /* * Wa_1409085225:tgl - * Wa_14010229206:tgl + * Wa_14010229206:tgl,rkl */ wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH); @@ -1677,9 +1713,37 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) * Wa_1407928979:tgl A* * Wa_18011464164:tgl B0+ * Wa_22010931296:tgl B0+ + * Wa_14010919138:rkl,tgl */ wa_write_or(wal, GEN7_FF_THREAD_MODE, GEN12_FF_TESSELATION_DOP_GATE_DISABLE); + + /* + * Wa_1607030317:tgl + * Wa_1607186500:tgl + * Wa_1607297627:tgl,rkl there are multiple entries for this + * WA in the BSpec; some indicate this is an A0-only WA, + * others indicate it applies to all steppings. + */ + wa_masked_en(wal, + GEN6_RC_SLEEP_PSMI_CONTROL, + GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE | + GEN8_RC_SEMA_IDLE_MSG_DISABLE); + + /* + * Wa_1606700617:tgl + * Wa_22010271021:tgl,rkl + */ + wa_masked_en(wal, + GEN9_CS_DEBUG_MODE1, + FF_DOP_CLOCK_GATE_DISABLE); + } + + if (IS_GEN(i915, 12)) { + /* Wa_1406941453:gen12 */ + wa_masked_en(wal, + GEN10_SAMPLER_MODE, + ENABLE_SMALLPL); } if (IS_GEN(i915, 11)) { @@ -1898,7 +1962,7 @@ xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) struct drm_i915_private *i915 = engine->i915; /* WaKBLVECSSemaphoreWaitPoll:kbl */ - if (IS_KBL_REVID(i915, KBL_REVID_A0, KBL_REVID_E0)) { + if (IS_KBL_GT_REVID(i915, KBL_REVID_A0, KBL_REVID_E0)) { wa_write(wal, RING_SEMA_WAIT_POLL(engine->mmio_base), 1); @@ -2045,6 +2109,7 @@ static int engine_wa_list_verify(struct intel_context *ce, const struct i915_wa *wa; struct i915_request *rq; struct i915_vma *vma; + struct i915_gem_ww_ctx ww; unsigned int i; u32 *results; int err; @@ -2057,29 +2122,34 @@ static int engine_wa_list_verify(struct intel_context *ce, return PTR_ERR(vma); intel_engine_pm_get(ce->engine); - rq = intel_context_create_request(ce); - intel_engine_pm_put(ce->engine); + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(vma->obj, &ww); + if (err == 0) + err = intel_context_pin_ww(ce, &ww); + if (err) + goto err_pm; + + rq = i915_request_create(ce); if (IS_ERR(rq)) { err = PTR_ERR(rq); - goto err_vma; + goto err_unpin; } - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, true); if (err == 0) err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(vma); - if (err) { - i915_request_add(rq); - goto err_vma; - } - - err = wa_list_srm(rq, wal, vma); - if (err) - goto err_vma; + if (err == 0) + err = wa_list_srm(rq, wal, vma); i915_request_get(rq); + if (err) + i915_request_set_error_once(rq, err); i915_request_add(rq); + + if (err) + goto err_rq; + if (i915_request_wait(rq, 0, HZ / 5) < 0) { err = -ETIME; goto err_rq; @@ -2104,7 +2174,16 @@ static int engine_wa_list_verify(struct intel_context *ce, err_rq: i915_request_put(rq); -err_vma: +err_unpin: + intel_context_unpin(ce); +err_pm: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + intel_engine_pm_put(ce->engine); i915_vma_unpin(vma); i915_vma_put(vma); return err; diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c index b8dd3cbc8696..dfd1cfb8a7ec 100644 --- a/drivers/gpu/drm/i915/gt/mock_engine.c +++ b/drivers/gpu/drm/i915/gt/mock_engine.c @@ -131,6 +131,10 @@ static void mock_context_unpin(struct intel_context *ce) { } +static void mock_context_post_unpin(struct intel_context *ce) +{ +} + static void mock_context_destroy(struct kref *ref) { struct intel_context *ce = container_of(ref, typeof(*ce), ref); @@ -152,8 +156,7 @@ static int mock_context_alloc(struct intel_context *ce) if (!ce->ring) return -ENOMEM; - GEM_BUG_ON(ce->timeline); - ce->timeline = intel_timeline_create(ce->engine->gt, NULL); + ce->timeline = intel_timeline_create(ce->engine->gt); if (IS_ERR(ce->timeline)) { kfree(ce->engine); return PTR_ERR(ce->timeline); @@ -164,7 +167,13 @@ static int mock_context_alloc(struct intel_context *ce) return 0; } -static int mock_context_pin(struct intel_context *ce) +static int mock_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, void **unused) +{ + return 0; +} + +static int mock_context_pin(struct intel_context *ce, void *unused) { return 0; } @@ -176,8 +185,10 @@ static void mock_context_reset(struct intel_context *ce) static const struct intel_context_ops mock_context_ops = { .alloc = mock_context_alloc, + .pre_pin = mock_context_pre_pin, .pin = mock_context_pin, .unpin = mock_context_unpin, + .post_unpin = mock_context_post_unpin, .enter = intel_context_enter_engine, .exit = intel_context_exit_engine, @@ -261,11 +272,12 @@ static void mock_engine_release(struct intel_engine_cs *engine) GEM_BUG_ON(timer_pending(&mock->hw_delay)); + intel_breadcrumbs_free(engine->breadcrumbs); + intel_context_unpin(engine->kernel_context); intel_context_put(engine->kernel_context); intel_engine_fini_retire(engine); - intel_engine_fini_breadcrumbs(engine); } struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, @@ -323,20 +335,26 @@ int mock_engine_init(struct intel_engine_cs *engine) struct intel_context *ce; intel_engine_init_active(engine, ENGINE_MOCK); - intel_engine_init_breadcrumbs(engine); intel_engine_init_execlists(engine); intel_engine_init__pm(engine); intel_engine_init_retire(engine); + engine->breadcrumbs = intel_breadcrumbs_create(NULL); + if (!engine->breadcrumbs) + return -ENOMEM; + ce = create_kernel_context(engine); if (IS_ERR(ce)) goto err_breadcrumbs; + /* We insist the kernel context is using the status_page */ + engine->status_page.vma = ce->timeline->hwsp_ggtt; + engine->kernel_context = ce; return 0; err_breadcrumbs: - intel_engine_fini_breadcrumbs(engine); + intel_breadcrumbs_free(engine->breadcrumbs); return -ENOMEM; } diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c index 52af1cee9a94..1f4020e906a8 100644 --- a/drivers/gpu/drm/i915/gt/selftest_context.c +++ b/drivers/gpu/drm/i915/gt/selftest_context.c @@ -68,6 +68,8 @@ static int context_sync(struct intel_context *ce) } while (!err); mutex_unlock(&tl->mutex); + /* Wait for all barriers to complete (remote CPU) before we check */ + i915_active_unlock_wait(&ce->active); return err; } diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c index 73243ba59c7d..e73854dd2fe0 100644 --- a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c @@ -47,7 +47,10 @@ static int pulse_active(struct i915_active *active) static void pulse_free(struct kref *kref) { - kfree(container_of(kref, struct pulse, kref)); + struct pulse *p = container_of(kref, typeof(*p), kref); + + i915_active_fini(&p->active); + kfree(p); } static void pulse_put(struct pulse *p) diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 3fc5de961280..95d41c01d0e0 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -2729,7 +2729,7 @@ static int create_gang(struct intel_engine_cs *engine, i915_gem_object_put(obj); intel_context_put(ce); - rq->client_link.next = &(*prev)->client_link; + rq->mock.link.next = &(*prev)->mock.link; *prev = rq; return 0; @@ -2970,8 +2970,7 @@ static int live_preempt_gang(void *arg) } while (rq) { /* wait for each rq from highest to lowest prio */ - struct i915_request *n = - list_next_entry(rq, client_link); + struct i915_request *n = list_next_entry(rq, mock.link); if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0) { struct drm_printer p = @@ -3090,7 +3089,7 @@ static struct i915_vma *create_global(struct intel_gt *gt, size_t sz) return vma; } - err = i915_ggtt_pin(vma, 0, 0); + err = i915_ggtt_pin(vma, NULL, 0, 0); if (err) { i915_vma_put(vma); return ERR_PTR(err); @@ -4997,6 +4996,7 @@ static int __live_lrc_state(struct intel_engine_cs *engine, { struct intel_context *ce; struct i915_request *rq; + struct i915_gem_ww_ctx ww; enum { RING_START_IDX = 0, RING_TAIL_IDX, @@ -5011,7 +5011,11 @@ static int __live_lrc_state(struct intel_engine_cs *engine, if (IS_ERR(ce)) return PTR_ERR(ce); - err = intel_context_pin(ce); + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(scratch->obj, &ww); + if (!err) + err = intel_context_pin_ww(ce, &ww); if (err) goto err_put; @@ -5040,11 +5044,9 @@ static int __live_lrc_state(struct intel_engine_cs *engine, *cs++ = i915_ggtt_offset(scratch) + RING_TAIL_IDX * sizeof(u32); *cs++ = 0; - i915_vma_lock(scratch); err = i915_request_await_object(rq, scratch->obj, true); if (!err) err = i915_vma_move_to_active(scratch, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(scratch); i915_request_get(rq); i915_request_add(rq); @@ -5081,6 +5083,12 @@ err_rq: err_unpin: intel_context_unpin(ce); err_put: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); intel_context_put(ce); return err; } diff --git a/drivers/gpu/drm/i915/gt/selftest_rps.c b/drivers/gpu/drm/i915/gt/selftest_rps.c index 8624f5d2a1f3..3540ba9bd459 100644 --- a/drivers/gpu/drm/i915/gt/selftest_rps.c +++ b/drivers/gpu/drm/i915/gt/selftest_rps.c @@ -77,20 +77,20 @@ create_spin_counter(struct intel_engine_cs *engine, vma = i915_vma_instance(obj, vm, NULL); if (IS_ERR(vma)) { - i915_gem_object_put(obj); - return vma; + err = PTR_ERR(vma); + goto err_put; } err = i915_vma_pin(vma, 0, 0, PIN_USER); - if (err) { - i915_vma_put(vma); - return ERR_PTR(err); - } + if (err) + goto err_unlock; + + i915_vma_lock(vma); base = i915_gem_object_pin_map(obj, I915_MAP_WC); if (IS_ERR(base)) { - i915_gem_object_put(obj); - return ERR_CAST(base); + err = PTR_ERR(base); + goto err_unpin; } cs = base; @@ -134,6 +134,14 @@ create_spin_counter(struct intel_engine_cs *engine, *cancel = base + loop; *counter = srm ? memset32(base + end, 0, 1) : NULL; return vma; + +err_unpin: + i915_vma_unpin(vma); +err_unlock: + i915_vma_unlock(vma); +err_put: + i915_gem_object_put(obj); + return ERR_PTR(err); } static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms) @@ -639,7 +647,6 @@ int live_rps_frequency_cs(void *arg) goto err_vma; } - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, false); if (!err) err = i915_vma_move_to_active(vma, rq, 0); @@ -647,7 +654,6 @@ int live_rps_frequency_cs(void *arg) err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0); - i915_vma_unlock(vma); i915_request_add(rq); if (err) goto err_vma; @@ -700,7 +706,7 @@ int live_rps_frequency_cs(void *arg) f = act; /* may skip ahead [pcu granularity] */ } - err = -EINVAL; + err = -EINTR; /* ignore error, continue on with test */ } err_vma: @@ -708,6 +714,7 @@ err_vma: i915_gem_object_flush_map(vma->obj); i915_gem_object_unpin_map(vma->obj); i915_vma_unpin(vma); + i915_vma_unlock(vma); i915_vma_put(vma); st_engine_heartbeat_enable(engine); @@ -781,7 +788,6 @@ int live_rps_frequency_srm(void *arg) goto err_vma; } - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, false); if (!err) err = i915_vma_move_to_active(vma, rq, 0); @@ -789,7 +795,6 @@ int live_rps_frequency_srm(void *arg) err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0); - i915_vma_unlock(vma); i915_request_add(rq); if (err) goto err_vma; @@ -841,7 +846,7 @@ int live_rps_frequency_srm(void *arg) f = act; /* may skip ahead [pcu granularity] */ } - err = -EINVAL; + err = -EINTR; /* ignore error, continue on with test */ } err_vma: @@ -849,6 +854,7 @@ err_vma: i915_gem_object_flush_map(vma->obj); i915_gem_object_unpin_map(vma->obj); i915_vma_unpin(vma); + i915_vma_unlock(vma); i915_vma_put(vma); st_engine_heartbeat_enable(engine); diff --git a/drivers/gpu/drm/i915/gt/selftest_timeline.c b/drivers/gpu/drm/i915/gt/selftest_timeline.c index fb5b7d3498a6..19c2cb166e7c 100644 --- a/drivers/gpu/drm/i915/gt/selftest_timeline.c +++ b/drivers/gpu/drm/i915/gt/selftest_timeline.c @@ -72,7 +72,7 @@ static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state, unsigned long cacheline; int err; - tl = intel_timeline_create(state->gt, NULL); + tl = intel_timeline_create(state->gt); if (IS_ERR(tl)) return PTR_ERR(tl); @@ -158,7 +158,7 @@ out: __mock_hwsp_record(&state, na, NULL); kfree(state.history); err_put: - drm_dev_put(&i915->drm); + mock_destroy_device(i915); return err; } @@ -455,7 +455,7 @@ tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value) struct i915_request *rq; int err; - err = intel_timeline_pin(tl); + err = intel_timeline_pin(tl, NULL); if (err) { rq = ERR_PTR(err); goto out; @@ -487,11 +487,11 @@ checked_intel_timeline_create(struct intel_gt *gt) { struct intel_timeline *tl; - tl = intel_timeline_create(gt, NULL); + tl = intel_timeline_create(gt); if (IS_ERR(tl)) return tl; - if (*tl->hwsp_seqno != tl->seqno) { + if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) { pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n", *tl->hwsp_seqno, tl->seqno); intel_timeline_put(tl); @@ -561,9 +561,9 @@ static int live_hwsp_engine(void *arg) for (n = 0; n < count; n++) { struct intel_timeline *tl = timelines[n]; - if (!err && *tl->hwsp_seqno != n) { - pr_err("Invalid seqno stored in timeline %lu @ %x, found 0x%x\n", - n, tl->hwsp_offset, *tl->hwsp_seqno); + if (!err && READ_ONCE(*tl->hwsp_seqno) != n) { + GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n", + n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno); GEM_TRACE_DUMP(); err = -EINVAL; } @@ -633,9 +633,9 @@ out: for (n = 0; n < count; n++) { struct intel_timeline *tl = timelines[n]; - if (!err && *tl->hwsp_seqno != n) { - pr_err("Invalid seqno stored in timeline %lu @ %x, found 0x%x\n", - n, tl->hwsp_offset, *tl->hwsp_seqno); + if (!err && READ_ONCE(*tl->hwsp_seqno) != n) { + GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n", + n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno); GEM_TRACE_DUMP(); err = -EINVAL; } @@ -660,14 +660,14 @@ static int live_hwsp_wrap(void *arg) * foreign GPU references. */ - tl = intel_timeline_create(gt, NULL); + tl = intel_timeline_create(gt); if (IS_ERR(tl)) return PTR_ERR(tl); if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline) goto out_free; - err = intel_timeline_pin(tl); + err = intel_timeline_pin(tl, NULL); if (err) goto out_free; @@ -733,7 +733,8 @@ static int live_hwsp_wrap(void *arg) goto out; } - if (*hwsp_seqno[0] != seqno[0] || *hwsp_seqno[1] != seqno[1]) { + if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] || + READ_ONCE(*hwsp_seqno[1]) != seqno[1]) { pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n", *hwsp_seqno[0], *hwsp_seqno[1], seqno[0], seqno[1]); @@ -966,9 +967,10 @@ static int live_hwsp_recycle(void *arg) break; } - if (*tl->hwsp_seqno != count) { - pr_err("Invalid seqno stored in timeline %lu @ tl->hwsp_offset, found 0x%x\n", - count, *tl->hwsp_seqno); + if (READ_ONCE(*tl->hwsp_seqno) != count) { + GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n", + count, tl->fence_context, + tl->hwsp_offset, *tl->hwsp_seqno); GEM_TRACE_DUMP(); err = -EINVAL; } diff --git a/drivers/gpu/drm/i915/gt/selftest_workarounds.c b/drivers/gpu/drm/i915/gt/selftest_workarounds.c index febc9e6692ba..61a0532d0f3d 100644 --- a/drivers/gpu/drm/i915/gt/selftest_workarounds.c +++ b/drivers/gpu/drm/i915/gt/selftest_workarounds.c @@ -214,7 +214,7 @@ static int check_whitelist(struct i915_gem_context *ctx, return PTR_ERR(results); err = 0; - i915_gem_object_lock(results); + i915_gem_object_lock(results, NULL); intel_wedge_on_timeout(&wedge, engine->gt, HZ / 5) /* safety net! */ err = i915_gem_object_set_to_cpu_domain(results, false); i915_gem_object_unlock(results); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c index 861657897c0f..942c7c187adb 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c @@ -677,7 +677,7 @@ struct i915_vma *intel_guc_allocate_vma(struct intel_guc *guc, u32 size) goto err; flags = PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma); - ret = i915_ggtt_pin(vma, 0, flags); + ret = i915_ggtt_pin(vma, NULL, 0, flags); if (ret) { vma = ERR_PTR(ret); goto err; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c index 59b27aba15c6..80e8b6c3bc8c 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c @@ -51,8 +51,8 @@ void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw, * Note that RKL uses the same firmware as TGL. */ #define INTEL_UC_FIRMWARE_DEFS(fw_def, guc_def, huc_def) \ - fw_def(ROCKETLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 0, 12)) \ - fw_def(TIGERLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 0, 12)) \ + fw_def(ROCKETLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 5, 0)) \ + fw_def(TIGERLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 5, 0)) \ fw_def(ELKHARTLAKE, 0, guc_def(ehl, 33, 0, 4), huc_def(ehl, 9, 0, 0)) \ fw_def(ICELAKE, 0, guc_def(icl, 33, 0, 0), huc_def(icl, 9, 0, 0)) \ fw_def(COMETLAKE, 5, guc_def(cml, 33, 0, 0), huc_def(cml, 4, 0, 0)) \ |