Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better default schedule/storage type inference #1241

Draft
wants to merge 28 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
af450d2
Refactor calls to schedule inference
tbennun Apr 18, 2023
96cd8af
Fix bug in nested schedule inference
tbennun Apr 18, 2023
5623322
Add test
tbennun Apr 20, 2023
d60d2e3
Make sequential_innermaps default to True
tbennun Apr 21, 2023
9f93016
Revamp schedule/storage type inference by using surrounding storages …
tbennun Apr 21, 2023
f5e5802
Fix argument usage
tbennun Apr 21, 2023
5a2564d
Allow persistent register allocation and improve tests
tbennun May 1, 2023
a6cfc73
More schedule/storage inference tests
tbennun May 1, 2023
add2a8f
Remove schedule freeze upon library node expansion
tbennun May 1, 2023
0f1de9b
Improve nested storage inference and add default top-level schedule/s…
tbennun May 1, 2023
bbd105f
GPU device has GPU_ThreadBlock as its default nested schedule
tbennun May 1, 2023
e204ff1
Remove `GPU_Default` schedule and add a `device` property instead
tbennun May 1, 2023
4a21a25
Fix bug in test and type inference
tbennun May 2, 2023
adfd3b6
Merge remote-tracking branch 'origin/master' into default-schedule-re…
tbennun May 2, 2023
c4f17d1
Fix usage of schedule/storage inference and remove it from inlining
tbennun May 3, 2023
bf77137
Fix dispatching of nested SDFGs with a GPU_Device schedule
tbennun May 3, 2023
26906dd
Fix FPGA dispatching with scalars
tbennun May 7, 2023
b3634b0
Take storage from parent SDFG for inter-state-edge-only arrays
tbennun May 7, 2023
e2d7c22
Fix CUDA codegen preprocessing w.r.t. streams
tbennun May 7, 2023
09d04da
GPU codegen: Switch to kernel schedule rather than an ambiguous value
tbennun May 7, 2023
11483ad
Merge branch 'master' into default-schedule-revamp
tbennun May 7, 2023
5d54ea4
Reduce eagerness of FPGA code generator
tbennun May 7, 2023
9783bb7
Fix dangling connector in gemm
tbennun May 7, 2023
c8dd4b5
Merge branch 'default-schedule-revamp' of github.com:spcl/dace into d…
tbennun May 7, 2023
d6574f7
improve host-allocated storage detection
tbennun May 7, 2023
894c280
Fix GEMM connector issue
tbennun May 7, 2023
da06520
GPU codegen: improve in-device / in-threadblock testing
tbennun May 7, 2023
6b77357
Merge branch 'master' into default-schedule-revamp
tbennun Jun 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dace/codegen/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,14 +187,14 @@ def generate_code(sdfg, validate=True) -> List[CodeObject]:
infer_types.infer_connector_types(sdfg)

# Set default storage/schedule types in SDFG
infer_types.set_default_schedule_and_storage_types(sdfg, None)
infer_types.set_default_schedule_and_storage_types(sdfg)

# Recursively expand library nodes that have not yet been expanded
sdfg.expand_library_nodes()

# After expansion, run another pass of connector/type inference
infer_types.infer_connector_types(sdfg)
infer_types.set_default_schedule_and_storage_types(sdfg, None)
infer_types.set_default_schedule_and_storage_types(sdfg)

frame = framecode.DaCeCodeGenerator(sdfg)

Expand Down
2 changes: 1 addition & 1 deletion dace/codegen/instrumentation/likwid.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ class LIKWIDInstrumentationGPU(InstrumentationProvider):
the Likwid tool.
"""

perf_whitelist_schedules = [dtypes.ScheduleType.GPU_Default, dtypes.ScheduleType.GPU_Device]
perf_whitelist_schedules = [dtypes.ScheduleType.GPU_Device]

def __init__(self):
self._likwid_used = False
Expand Down
39 changes: 20 additions & 19 deletions dace/codegen/targets/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from dace.frontend import operations
from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, dynamic_map_inputs, has_dynamic_map_inputs,
is_array_stream_view, is_devicelevel_gpu, nodes, scope_contains_scope)
from dace.sdfg import utils as sdutil
from dace.sdfg import scope as sdscope, utils as sdutil
from dace.transformation import helpers as xfh
from dace.transformation.passes import analysis as ap

Expand Down Expand Up @@ -80,7 +80,7 @@ def __init__(self, frame_codegen, sdfg: SDFG):
self._initcode = CodeIOStream()
self._exitcode = CodeIOStream()
self._global_sdfg: SDFG = sdfg
self._toplevel_schedule = None
self._kernel_schedule = None
self._arglists: Dict[nodes.MapEntry, Dict[str, dt.Data]] = {}

# Keep track of current "scope entry/exit" code streams for extra
Expand Down Expand Up @@ -436,14 +436,14 @@ def get_generated_codeobjects(self):

def node_dispatch_predicate(self, sdfg, state, node):
if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes
if node.schedule in dtypes.GPU_SCHEDULES:
if not isinstance(node, nodes.NestedSDFG) and node.schedule in dtypes.GPU_SCHEDULES:
return True
if isinstance(node, nodes.NestedSDFG) and CUDACodeGen._in_device_code:
return True
return False

def state_dispatch_predicate(self, sdfg, state):
if self._toplevel_schedule in dtypes.GPU_SCHEDULES:
if self._kernel_schedule in dtypes.GPU_SCHEDULES:
return True
for node in state.sink_nodes():
if hasattr(node, '_cuda_stream'):
Expand Down Expand Up @@ -808,9 +808,7 @@ def increment(streams):
if not hasattr(e.dst, '_cs_childpath'):
e.dst._cs_childpath = False
if isinstance(e.dst, nodes.NestedSDFG):
if e.dst.schedule not in dtypes.GPU_SCHEDULES:
max_streams, max_events = self._compute_cudastreams(e.dst.sdfg, e.dst._cuda_stream,
max_events + 1)
max_streams, max_events = self._compute_cudastreams(e.dst.sdfg, e.dst._cuda_stream, max_events + 1)

state_streams.append(max_streams if concurrent_streams == 0 else concurrent_streams)
state_subsdfg_events.append(max_events)
Expand Down Expand Up @@ -1262,21 +1260,25 @@ def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stre

# Special case: if this is a GPU grid state and something is reading
# from a possible result of a collaborative write, sync first
if self._toplevel_schedule == dtypes.ScheduleType.GPU_Device:
state_id = next(i for i, s in enumerate(sdfg.nodes()) if s == state)
for node in state.nodes():
if (isinstance(node, nodes.AccessNode) and node.desc(sdfg).storage == dtypes.StorageType.GPU_Shared
and state.in_degree(node) == 0 and state.out_degree(node) > 0):
if not self._scope_has_collaborative_copy:
callsite_stream.write('__syncthreads();', sdfg, state_id)
break
if self._kernel_schedule == dtypes.ScheduleType.GPU_Device:
# If we are inside a thread-block map, we do not need to synchronize
if not sdscope.is_in_scope(
sdfg, state, None,
[dtypes.ScheduleType.GPU_ThreadBlock, dtypes.ScheduleType.GPU_ThreadBlock_Dynamic]):
state_id = next(i for i, s in enumerate(sdfg.nodes()) if s == state)
for node in state.nodes():
if (isinstance(node, nodes.AccessNode) and node.desc(sdfg).storage == dtypes.StorageType.GPU_Shared
and state.in_degree(node) == 0 and state.out_degree(node) > 0):
if not self._scope_has_collaborative_copy:
callsite_stream.write('__syncthreads();', sdfg, state_id)
break

# In GPU_Persistent scopes, states need global barriers between them,
# the DFGs inside of a state are independent, so they don't need
# synchronization. DFGs in a GPU_Persistent scope are per se executed
# by a single thread only. (Device) Maps however can be distributed
# across multiple threads
elif self._toplevel_schedule == dtypes.ScheduleType.GPU_Persistent:
elif self._kernel_schedule == dtypes.ScheduleType.GPU_Persistent:

# reset streams in GPU persistent maps if the lifetime is scope,
otherwise streams do not behave as expected because they are
Expand Down Expand Up @@ -1884,6 +1886,7 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
# Dispatch internal code
assert CUDACodeGen._in_device_code is False
CUDACodeGen._in_device_code = True
self._kernel_schedule = node.schedule
self._kernel_map = node
self._kernel_state = sdfg.node(state_id)
self._block_dims = block_dims
Expand Down Expand Up @@ -1933,6 +1936,7 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
self._block_dims = None
self._kernel_map = None
self._kernel_state = None
self._kernel_schedule = None
CUDACodeGen._in_device_code = False
self._grid_dims = None

Expand Down Expand Up @@ -2421,15 +2425,12 @@ def generate_nsdfg_arguments(self, sdfg, dfg, state, node):
return result

def _generate_NestedSDFG(self, sdfg, dfg, state_id, node, function_stream, callsite_stream):
old_schedule = self._toplevel_schedule
self._toplevel_schedule = node.schedule
old_codegen = self._cpu_codegen.calling_codegen
self._cpu_codegen.calling_codegen = self

self._cpu_codegen._generate_NestedSDFG(sdfg, dfg, state_id, node, function_stream, callsite_stream)

self._cpu_codegen.calling_codegen = old_codegen
self._toplevel_schedule = old_schedule

def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite_stream):
if node.map.schedule == dtypes.ScheduleType.GPU_Device:
Expand Down
58 changes: 26 additions & 32 deletions dace/codegen/targets/framecode.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import numpy as np

import dace
from dace import config, data, dtypes
from dace import config, data, dtypes, symbolic
from dace.cli import progress
from dace.codegen import control_flow as cflow
from dace.codegen import dispatcher as disp
Expand Down Expand Up @@ -414,45 +414,30 @@ def dispatch_state(state: SDFGState) -> str:

return states_generated

def _get_schedule(self, scope: Union[nodes.EntryNode, SDFGState, SDFG]) -> dtypes.ScheduleType:
    """Return the schedule type that governs the given scope.

    For an entry node the schedule is read directly; for a state or SDFG
    the method walks up through parent SDFGs (via the enclosing nested-SDFG
    node) until a surrounding entry node is found, defaulting to
    ``Sequential`` at the top level or for a ``None`` scope.
    """
    default_schedule = dtypes.ScheduleType.Sequential
    if scope is None:
        return default_schedule
    if isinstance(scope, nodes.EntryNode):
        # Map/consume entry nodes carry their schedule directly.
        return scope.schedule
    if not isinstance(scope, (SDFGState, SDFG)):
        raise TypeError

    sdfg: SDFG = scope if isinstance(scope, SDFG) else scope.parent
    if sdfg.parent_nsdfg_node is None:
        # Top-level SDFG: nothing surrounds it, use the default.
        return default_schedule

    # Go one SDFG up: prefer the entry node enclosing the nested-SDFG
    # node in the parent state, otherwise recurse on the parent state.
    parent_state = sdfg.parent
    parent_scope = parent_state.entry_node(sdfg.parent_nsdfg_node)
    if parent_scope is not None:
        return self._get_schedule(parent_scope)
    return self._get_schedule(parent_state)

def _can_allocate(self, sdfg: SDFG, state: SDFGState, desc: data.Data, scope: Union[nodes.EntryNode, SDFGState,
SDFG]) -> bool:
schedule = self._get_schedule(scope)
# if not dtypes.can_allocate(desc.storage, schedule):
# return False
if dtypes.can_allocate(desc.storage, schedule):
return True

# Check for device-level memory recursively
node = scope if isinstance(scope, nodes.EntryNode) else None
cstate = scope if isinstance(scope, SDFGState) else state
csdfg = scope if isinstance(scope, SDFG) else sdfg

if isinstance(node, nodes.EntryNode) and node.schedule == dtypes.ScheduleType.GPU_Device:
in_gpu = True
else:
in_gpu = sdscope.is_devicelevel_gpu(csdfg, cstate, node)
if isinstance(node, nodes.EntryNode) and node.schedule == dtypes.ScheduleType.FPGA_Device:
in_fpga = True
else:
in_fpga = sdscope.is_devicelevel_fpga(csdfg, cstate, node)

if desc.storage in dtypes.FPGA_STORAGES:
return sdscope.is_devicelevel_fpga(csdfg, cstate, node)
return in_fpga
elif desc.storage in dtypes.GPU_STORAGES:
return sdscope.is_devicelevel_gpu(csdfg, cstate, node)
return in_gpu
elif desc.storage in dtypes.HOST_ALLOCATED_STORAGES:
return not (in_gpu or in_fpga)

return False
return True

def determine_allocation_lifetime(self, top_sdfg: SDFG):
"""
Expand Down Expand Up @@ -526,8 +511,17 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG):
continue

definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';'

if desc.storage != dtypes.StorageType.CPU_ThreadLocal: # If thread-local, skip struct entry
arrsize = desc.total_size
arrsize_bytes = 0
if not isinstance(desc.dtype, dtypes.opaque):
arrsize_bytes = arrsize * desc.dtype.bytes

# Special case for registers with static size
if (desc.storage == dtypes.StorageType.Register
and not ((symbolic.issymbolic(arrsize, sdfg.constants)) or
((arrsize_bytes > config.Config.get("compiler", "max_stack_array_size")) == True))):
self.statestruct.append(f'{desc.dtype.ctype} __{sdfg.sdfg_id}_{name}[{sym2cpp(arrsize)}];')
elif desc.storage != dtypes.StorageType.CPU_ThreadLocal: # If thread-local, skip struct entry
self.statestruct.append(definition)

self.to_allocate[top_sdfg].append((sdfg, first_state_instance, first_node_instance, True, True, True))
Expand Down
Loading