drm/i915: use a separate context for gpu relocs
The CS pre-parser can pre-fetch commands across memory sync points and
starting from gen12 it is able to pre-fetch across BB_START and BB_END
boundaries as well, so when we emit gpu relocs the pre-parser might
fetch the target location of the reloc before the memory write lands.
The parser can't pre-fetch across the ctx switch, so we use a separate
context to guarantee that the memory is synchronized before the parser
can get to it.
Note that there is no risk of the CS doing a lite restore from the reloc
context to the user context, even if the two have the same hw_id,
because since gen11 the CS also checks the LRCA when deciding if it can
lite-restore.
v2: limit new context to gen12+, release in eb_destroy, add a comment
    in emit_fini_breadcrumb (Chris).
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190827185805.21799-1-daniele.ceraolospurio@intel.com
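
For orientation, here is a condensed sketch of the reloc-context lifecycle this patch introduces. It is stitched together from the hunks below (reloc_gpu(), __reloc_gpu_alloc() and eb_destroy()) and is not standalone, buildable kernel code; the names and signatures are taken from the diff itself.

	/* reloc_gpu(): pick the context used to emit relocation writes */
	if (!cache->ce) {
		struct intel_context *ce;

		if (cache->gen >= 12)
			/* gen12+: a separate context, so the ctx switch orders
			 * the reloc writes ahead of the pre-parser */
			ce = intel_context_create(eb->context->gem_context,
						  eb->engine);
		else
			/* pre-gen12: reusing the user context is safe */
			ce = intel_context_get(eb->context);
		if (IS_ERR(ce))
			return ERR_CAST(ce);
		cache->ce = ce;
	}

	/* __reloc_gpu_alloc(): build the reloc request on that context */
	rq = intel_context_create_request(cache->ce);

	/* eb_destroy(): drop the reference once the execbuf is torn down */
	if (eb->reloc_cache.ce)
		intel_context_put(eb->reloc_cache.ce);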
			
			
@@ -252,6 +252,7 @@ struct i915_execbuffer {
 		bool has_fence : 1;
 		bool needs_unfenced : 1;
 
+		struct intel_context *ce;
 		struct i915_request *rq;
 		u32 *rq_cmd;
 		unsigned int rq_size;
@@ -880,6 +881,9 @@ static void eb_destroy(const struct i915_execbuffer *eb)
 {
 	GEM_BUG_ON(eb->reloc_cache.rq);
 
+	if (eb->reloc_cache.ce)
+		intel_context_put(eb->reloc_cache.ce);
+
 	if (eb->lut_size > 0)
 		kfree(eb->buckets);
 }
@@ -903,6 +907,7 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->has_fence = cache->gen < 4;
 	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
 	cache->node.allocated = false;
+	cache->ce = NULL;
 	cache->rq = NULL;
 	cache->rq_size = 0;
 }
@@ -1168,7 +1173,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
 	if (err)
 		goto err_unmap;
 
-	rq = i915_request_create(eb->context);
+	rq = intel_context_create_request(cache->ce);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto err_unpin;
@@ -1239,6 +1244,29 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
 		if (!intel_engine_can_store_dword(eb->engine))
 			return ERR_PTR(-ENODEV);
 
+		if (!cache->ce) {
+			struct intel_context *ce;
+
+			/*
+			 * The CS pre-parser can pre-fetch commands across
+			 * memory sync points and starting gen12 it is able to
+			 * pre-fetch across BB_START and BB_END boundaries
+			 * (within the same context). We therefore use a
+			 * separate context gen12+ to guarantee that the reloc
+			 * writes land before the parser gets to the target
+			 * memory location.
+			 */
+			if (cache->gen >= 12)
+				ce = intel_context_create(eb->context->gem_context,
+							  eb->engine);
+			else
+				ce = intel_context_get(eb->context);
+			if (IS_ERR(ce))
+				return ERR_CAST(ce);
+
+			cache->ce = ce;
+		}
+
 		err = __reloc_gpu_alloc(eb, vma, len);
 		if (unlikely(err))
 			return ERR_PTR(err);

@@ -2931,6 +2931,24 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 	return gen8_emit_fini_breadcrumb_footer(request, cs);
 }
 
+/*
+ * Note that the CS instruction pre-parser will not stall on the breadcrumb
+ * flush and will continue pre-fetching the instructions after it before the
+ * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
+ * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
+ * of the next request before the memory has been flushed, we're guaranteed that
+ * we won't access the batch itself too early.
+ * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
+ * so, if the current request is modifying an instruction in the next request on
+ * the same intel_context, we might pre-fetch and then execute the pre-update
+ * instruction. To avoid this, the users of self-modifying code should either
+ * disable the parser around the code emitting the memory writes, via a new flag
+ * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
+ * the in-kernel use-cases we've opted to use a separate context, see
+ * reloc_gpu() as an example.
+ * All the above applies only to the instructions themselves. Non-inline data
+ * used by the instructions is not pre-fetched.
+ */
 static u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *request,
 					   u32 *cs)
 {