diff --git a/README.md b/README.md index 7f9989c..f86405d 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ To run the code, the following software must be available: - Python 3.6.x or newer. - The `virtualenv` module (installed with `pip install virtualenv`). - A C++17-capable compiler (e.g., GCC 7.x or Clang 6.x). +- graphviz (for graph plotting support) - One or both FPGA compilers: - Intel FPGA OpenCL SDK (tested with 18.1.1 and 19.1) - Xilinx Vitis (tested with 2020.2) @@ -47,6 +48,13 @@ kernel source files themselves in: .dacecache//src/intel_fpga/device ``` +To run a low-level analysis of the buffer sizes and to visualize the stencil program, invoke the executable `stencilflow/kernel_chain_graph.py`. +Example usage: + +```bash +stencilflow/kernel_chain_graph.py -stencil_file test/stencils/jacobi3d_32x32x32_8itr_8vec.json -plot -simulate -report -opt +``` + Verification ------------ @@ -81,3 +89,16 @@ It is a known issue that launching multiple Intel FPGA kernels in quick succession (such as is done in the tests) can sometimes fail sporadically, seemingly due to file I/O issues. Running individual programs should never fail. 
+Publication +----------- + +If you use StencilFlow, please cite us: +```bibtex +@inproceedings{stencilflow, + author = {de~Fine~Licht, Johannes and Kuster, Andreas and De~Matteis, Tiziano and Ben-Nun, Tal and Hofer, Dominic and Hoefler, Torsten}, + title = {StencilFlow: Mapping Large Stencil Programs to Distributed Spatial Computing Systems}, + year = {2021}, + booktitle = {Proceedings of the IEEE/ACM International Symposium on Code Generation and Optimization (CGO'21)}, + series = {CGO '21} +} +``` \ No newline at end of file diff --git a/dace b/dace index 1fc6ddd..e732b1d 160000 --- a/dace +++ b/dace @@ -1 +1 @@ -Subproject commit 1fc6dddd94ee7fd467f1802398f4dad778c9a68a +Subproject commit e732b1d7ff83debeac9c7075f9ec78f4d5facc05 diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py index ff54023..8aad222 100644 --- a/stencilflow/kernel_chain_graph.py +++ b/stencilflow/kernel_chain_graph.py @@ -15,6 +15,9 @@ import operator import re import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(__file__))) from typing import Any, List, Dict, Tuple @@ -289,14 +292,14 @@ def add_channels(self) -> None: name = src.name + "_" + dest.name channel = { "name": - name, + name, "delay_buffer": - self.kernel_nodes[dest.name].delay_buffer[ - src.name], + self.kernel_nodes[dest.name].delay_buffer[ + src.name], "internal_buffer": - dest.internal_buffer[src.name], + dest.internal_buffer[src.name], "data_type": - src.data_type + src.data_type } # add channel reference to global channel dictionary self.channels[name] = channel @@ -314,18 +317,18 @@ def add_channels(self) -> None: name = src.name + "_" + dest.name channel = { "name": - name, + name, "delay_buffer": - self.kernel_nodes[dest.name].delay_buffer[ - src.name], + self.kernel_nodes[dest.name].delay_buffer[ + src.name], "internal_buffer": - dest.internal_buffer[src.name], + dest.internal_buffer[src.name], "data_type": - src.data_type, + src.data_type, "input_dims": - self.inputs[src.name]["input_dims"] - if 
"input_dims" in self.inputs[src.name] - else None + self.inputs[src.name]["input_dims"] + if "input_dims" in self.inputs[src.name] + else None } # add channel reference to global channel dictionary self.channels[name] = channel @@ -342,13 +345,13 @@ def add_channels(self) -> None: name = src.name + "_" + dest.name channel = { "name": - name, + name, "delay_buffer": - self.output_nodes[dest.name].delay_buffer[ - src.name], + self.output_nodes[dest.name].delay_buffer[ + src.name], "internal_buffer": {}, "data_type": - src.data_type + src.data_type } # add channel reference to global channel dictionary self.channels[name] = channel @@ -386,7 +389,7 @@ def import_input(self) -> None: else: i["input_dims"] = stencilflow.ITERATORS[len(stencilflow. ITERATORS) - - self.kernel_dimensions:] + self.kernel_dimensions:] self.outputs = inp["outputs"] # handle stencil program output dimensions if self.kernel_dimensions == 1: # 1D @@ -394,8 +397,8 @@ def import_input(self) -> None: self.program[entry]["computation_string"] = \ self.program[entry]["computation_string"].replace("[", "[i, j,") # add two extra indices self.dimensions = [ - 1, 1 - ] + inp["dimensions"] # add two extra dimensions + 1, 1 + ] + inp["dimensions"] # add two extra dimensions elif self.kernel_dimensions == 2: # 2D for entry in self.program: self.program[entry]["computation_string"] = self.program[entry]["computation_string"] \ @@ -489,28 +492,37 @@ def compute_delay_buffer(self) -> None: order = list(nx.topological_sort(self.graph)) except nx.exception.NetworkXUnfeasible: cycle = next(nx.algorithms.cycles.simple_cycles(self.graph)) - raise ValueError("Cycle detected: {}".format( - [c.name for c in cycle])) + raise ValueError("Cycle detected: {}".format([c.name for c in cycle])) # go through all nodes for node in order: # process delay buffer (no additional delay buffer will appear because of the topological order) for inp in node.input_paths: + + # add internal buffer latency for internal computation + if not 
isinstance(node, Output): + for entry in node.input_paths[inp]: + name = entry[-1] + entry[2] += node.dist_to_center[name] + # compute maximum delay size per input max_delay = max(node.input_paths[inp]) - max_delay[ - 2] += 1 # add an extra delay cycle for the processing in the kernel node + max_delay[2] += 1 # add an extra delay cycle for the processing in the kernel node # loop over all inputs and set their size relative to the max size to have data ready at the exact # same time for entry in node.input_paths[inp]: name = entry[-1] max_size = stencilflow.convert_3d_to_1d( dimensions=self.dimensions, - index=stencilflow.list_subtract_cwise( - max_delay[:-1], entry[:-1])) - node.delay_buffer[name] = BoundedQueue(name=name, - maxsize=max_size) - node.delay_buffer[name].import_data( - [None] * node.delay_buffer[name].maxsize) + index=stencilflow.list_subtract_cwise(max_delay[:-1], entry[:-1])) + node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size) + node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize) + + # remove internal buffer latency for internal computation + if not isinstance(node, Output): + for entry in node.input_paths[inp]: + name = entry[-1] + entry[2] -= node.dist_to_center[name] + # set input node delay buffers to 1 if isinstance(node, Input): node.delay_buffer = BoundedQueue(name=node.name, @@ -716,7 +728,7 @@ def report(self, name): u.name, v.name, entry.name, entry.maxsize)) total_fast += entry.maxsize print("buffer size slow memory: {} \nbuffer size fast memory: {}".format( - total_slow, total_fast)) + total_slow, total_fast)) def operation_count(self): """For each operation type found in the ASTs, return a tuple of @@ -789,6 +801,14 @@ def runtime_lower_bound(self): type=int) parser.add_argument("-report", action="store_true") parser.add_argument("-simulate", action="store_true") + parser.add_argument("-opt", action="store_true") + parser.add_argument("-opt_goal", default=["min_fast_mem", 12000], nargs="+") + 
""" + choices: + - min_com_vol, FAST_MEM_BOUND, SLOW_MEM_BOUND + - min_fast_mem, COM_VOL_BOUND + - opt_ratio, RATIO + """ args = parser.parse_args() args.log_level = stencilflow.log_level.LogLevel(args.log_level) program_description = stencilflow.parse_json(args.stencil_file) @@ -809,6 +829,18 @@ def runtime_lower_bound(self): log_level=LogLevel(args.log_level)) sim.simulate() + # choose optimization goal (NOTE(review): -opt_goal has no type=, so bounds given on the command line arrive as strings while the default bound is an int -- confirm Optimizer converts them) + if args.opt: + from stencilflow import Optimizer + + opt = Optimizer(chain.kernel_nodes, chain.dimensions) # was self.*: no 'self' exists in this script section; 'chain' is the graph object also used for the report below + if args.opt_goal[0] == "min_com_vol": + opt.minimize_comm_vol(fast_memory_bound=args.opt_goal[1], slow_memory_bound=args.opt_goal[2]) + if args.opt_goal[0] == "min_fast_mem": + opt.minimize_fast_mem(communication_volume_bound=args.opt_goal[1]) + if args.opt_goal[0] == "opt_ratio": + opt.optimize_to_ratio(ratio=args.opt_goal[1]) + # output a report if argument -report is true if args.report: chain.report(args.stencil_file) diff --git a/stencilflow/sdfg_generator.py b/stencilflow/sdfg_generator.py index 22629b4..87e54e5 100644 --- a/stencilflow/sdfg_generator.py +++ b/stencilflow/sdfg_generator.py @@ -28,7 +28,7 @@ import networkx as nx -MINIMUM_CHANNEL_DEPTH = 2048 +MINIMUM_CHANNEL_DEPTH = 1024 NUM_BANKS = 4 diff --git a/test/stencils/horidiff_min.json b/test/stencils/horidiff_min.json new file mode 100644 index 0000000..cf2679b --- /dev/null +++ b/test/stencils/horidiff_min.json @@ -0,0 +1,84 @@ +{ + "inputs": { + "inA": { + "data": "inA_float32.dat", + "data_type": "float32", + "input_dims": [ + "i" + ] + } + }, + "outputs": [ + "out" + ], + "dimensions": [ + 10, + 10, + 10 + ], + "vectorization": 1, + "program": { + "k0": { + "data_type": "float32", + "computation_string": "k0 = inA[i]", + "boundary_conditions": { + "inA": { + "type": "constant", + "value": 0.0 + } + } + }, + "k1": { + "data_type": "float32", + "computation_string": "k1 = inA[i]", + "boundary_conditions": { + "inA": { + "type": "constant", + "value": 0.0 + } + } + }, + "k2": { + 
"data_type": "float32", + "computation_string": "k2 = k1[i, j, k] + k0[i+1, j, k] + k0[i, j, k]", + "boundary_conditions": { + "k1": { + "type": "constant", + "value": 0.0 + }, + "k0": { + "type": "constant", + "value": 0.0 + } + } + }, + "k3": { + "data_type": "float32", + "computation_string": "k3 = k0[i, j, k] + k1[i+1, j+1, k+1] + k1[i, j, k]", + "boundary_conditions": { + "k0": { + "type": "constant", + "value": 0.0 + }, + "k1": { + "type": "constant", + "value": 0.0 + } + } + }, + "out": { + "data_type": "float32", + "computation_string": "out = k2[i, j, k] + k3[i, j, k]", + "boundary_conditions": { + "k2":{ + "type": "constant", + "value": 0.0 + }, + "k3": { + "type": "constant", + "value": 0.0 + } + } + } + } +} diff --git a/test/stencils/horidiff_min_ext.json b/test/stencils/horidiff_min_ext.json new file mode 100644 index 0000000..607ff61 --- /dev/null +++ b/test/stencils/horidiff_min_ext.json @@ -0,0 +1,94 @@ +{ + "inputs": { + "inA": { + "data": "inA_float32.dat", + "data_type": "float32", + "input_dims": [ + "i" + ] + } + }, + "outputs": [ + "out" + ], + "dimensions": [ + 10, + 10, + 10 + ], + "vectorization": 1, + "program": { + "k0": { + "data_type": "float32", + "computation_string": "k0 = inA[i]", + "boundary_conditions": { + "inA": { + "type": "constant", + "value": 0.0 + } + } + }, + "k1": { + "data_type": "float32", + "computation_string": "k1 = inA[i]", + "boundary_conditions": { + "inA": { + "type": "constant", + "value": 0.0 + } + } + }, + "k2": { + "data_type": "float32", + "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]", + "boundary_conditions": { + "k1": { + "type": "constant", + "value": 0.0 + }, + "k0": { + "type": "constant", + "value": 0.0 + } + } + }, + "k3": { + "data_type": "float32", + "computation_string": "k3 = k0[i, j, k] + k4[i+1, j+1, k+1] + k4[i, j, k]", + "boundary_conditions": { + "k0": { + "type": "constant", + "value": 0.0 + }, + "k4": { + "type": "constant", + "value": 0.0 + } + } + }, + "k4": { 
+ "data_type": "float32", + "computation_string": "k4 = k1[i, j, k] + k1[i+1, j+1, k+1]", + "boundary_conditions": { + "k1": { + "type": "constant", + "value": 0.0 + } + } + }, + "out": { + "data_type": "float32", + "computation_string": "out = k2[i,j,k] + k3[i,j,k]", + "boundary_conditions": { + "k2":{ + "type": "constant", + "value": 0.0 + }, + "k3": { + "type": "constant", + "value": 0.0 + } + } + } + } +} diff --git a/test/stencils/jacobi3d_512x512x512.json b/test/stencils/jacobi3d_512x512x512.json new file mode 100644 index 0000000..82db32d --- /dev/null +++ b/test/stencils/jacobi3d_512x512x512.json @@ -0,0 +1,24 @@ +{ + "inputs": { + "a": { + "data": "data/zeros_32x32x32_fp32.dat", + "data_type": "float32" + } + }, + "outputs": ["b"], + "dimensions": [512, 512, 512], + "program": { + "b": { + "computation_string": + "b = 0.16666666 * (a[i-1,j,k] + a[i+1,j,k] + a[i,j-1,k] + a[i,j+1,k] + a[i,j,k-1] + a[i,j,k+1])", + "boundary_conditions": { + "a": { + "type": "constant", + "value": 1.0 + } + }, + "data_type": + "float32" + } + } +}