diff --git a/README.md b/README.md
index 7f9989c..f86405d 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ To run the code, the following software must be available:
 - Python 3.6.x or newer.
 - The `virtualenv` module (installed with `pip install virtualenv`).
 - A C++17-capable compiler (e.g., GCC 7.x or Clang 6.x).
+- Graphviz (for graph plotting support).
 - One or both FPGA compilers:
   - Intel FPGA OpenCL SDK (tested with 18.1.1 and 19.1)
   - Xilinx Vitis (tested with 2020.2) 
@@ -47,6 +48,13 @@ kernel source files themselves in:
 .dacecache/<kernel name>/src/intel_fpga/device
 ```
 
+To run a low-level analysis of the buffer sizes and to visualize the stencil program, invoke the script `stencilflow/kernel_chain_graph.py` directly.
+Example usage:
+
+```bash
+stencilflow/kernel_chain_graph.py -stencil_file test/stencils/jacobi3d_32x32x32_8itr_8vec.json -plot -simulate -report -opt
+```
+
 Verification
 ------------
 
@@ -81,3 +89,16 @@ It is a known issue that launching multiple Intel FPGA kernels in quick
 succession (such as is done in the tests) can sometimes fail sporadically,
 seemingly due to file I/O issues. Running individual programs should never fail.
 
+Publication
+-----------
+
+If you use StencilFlow in your work, please cite our paper:
+```bibtex
+@inproceedings{dace,
+  author    = {Johannes de~Fine~Licht and Andreas Kuster and Tiziano De~Matteis and Tal Ben-Nun and Dominic Hofer and Torsten Hoefler},
+  title     = {StencilFlow: Mapping Large Stencil Programs to Distributed Spatial Computing Systems},
+  year      = {2021},
+  booktitle = {Proceedings of the IEEE/ACM International Symposium on Code Generation and Optimization (CGO'21)},
+  series = {CGO '21}
+}
+```
\ No newline at end of file
diff --git a/dace b/dace
index 1fc6ddd..e732b1d 160000
--- a/dace
+++ b/dace
@@ -1 +1 @@
-Subproject commit 1fc6dddd94ee7fd467f1802398f4dad778c9a68a
+Subproject commit e732b1d7ff83debeac9c7075f9ec78f4d5facc05
diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py
index ff54023..8aad222 100644
--- a/stencilflow/kernel_chain_graph.py
+++ b/stencilflow/kernel_chain_graph.py
@@ -15,6 +15,9 @@
 import operator
 import re
 import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
 from typing import Any, List, Dict, Tuple
 
@@ -289,14 +292,14 @@ def add_channels(self) -> None:
                                 name = src.name + "_" + dest.name
                                 channel = {
                                     "name":
-                                    name,
+                                        name,
                                     "delay_buffer":
-                                    self.kernel_nodes[dest.name].delay_buffer[
-                                        src.name],
+                                        self.kernel_nodes[dest.name].delay_buffer[
+                                            src.name],
                                     "internal_buffer":
-                                    dest.internal_buffer[src.name],
+                                        dest.internal_buffer[src.name],
                                     "data_type":
-                                    src.data_type
+                                        src.data_type
                                 }
                                 # add channel reference to global channel dictionary
                                 self.channels[name] = channel
@@ -314,18 +317,18 @@ def add_channels(self) -> None:
                                 name = src.name + "_" + dest.name
                                 channel = {
                                     "name":
-                                    name,
+                                        name,
                                     "delay_buffer":
-                                    self.kernel_nodes[dest.name].delay_buffer[
-                                        src.name],
+                                        self.kernel_nodes[dest.name].delay_buffer[
+                                            src.name],
                                     "internal_buffer":
-                                    dest.internal_buffer[src.name],
+                                        dest.internal_buffer[src.name],
                                     "data_type":
-                                    src.data_type,
+                                        src.data_type,
                                     "input_dims":
-                                    self.inputs[src.name]["input_dims"]
-                                    if "input_dims" in self.inputs[src.name]
-                                    else None
+                                        self.inputs[src.name]["input_dims"]
+                                        if "input_dims" in self.inputs[src.name]
+                                        else None
                                 }
                                 # add channel reference to global channel dictionary
                                 self.channels[name] = channel
@@ -342,13 +345,13 @@ def add_channels(self) -> None:
                             name = src.name + "_" + dest.name
                             channel = {
                                 "name":
-                                name,
+                                    name,
                                 "delay_buffer":
-                                self.output_nodes[dest.name].delay_buffer[
-                                    src.name],
+                                    self.output_nodes[dest.name].delay_buffer[
+                                        src.name],
                                 "internal_buffer": {},
                                 "data_type":
-                                src.data_type
+                                    src.data_type
                             }
                             # add channel reference to global channel dictionary
                             self.channels[name] = channel
@@ -386,7 +389,7 @@ def import_input(self) -> None:
                 else:
                     i["input_dims"] = stencilflow.ITERATORS[len(stencilflow.
                                                                 ITERATORS) -
-                                                        self.kernel_dimensions:]
+                                                            self.kernel_dimensions:]
         self.outputs = inp["outputs"]
         # handle stencil program output dimensions
         if self.kernel_dimensions == 1:  # 1D
@@ -394,8 +397,8 @@ def import_input(self) -> None:
                 self.program[entry]["computation_string"] = \
                     self.program[entry]["computation_string"].replace("[", "[i, j,")  # add two extra indices
             self.dimensions = [
-                1, 1
-            ] + inp["dimensions"]  # add two extra dimensions
+                                  1, 1
+                              ] + inp["dimensions"]  # add two extra dimensions
         elif self.kernel_dimensions == 2:  # 2D
             for entry in self.program:
                 self.program[entry]["computation_string"] = self.program[entry]["computation_string"] \
@@ -489,28 +492,37 @@ def compute_delay_buffer(self) -> None:
             order = list(nx.topological_sort(self.graph))
         except nx.exception.NetworkXUnfeasible:
             cycle = next(nx.algorithms.cycles.simple_cycles(self.graph))
-            raise ValueError("Cycle detected: {}".format(
-                [c.name for c in cycle]))
+            raise ValueError("Cycle detected: {}".format([c.name for c in cycle]))
         # go through all nodes
         for node in order:
             # process delay buffer (no additional delay buffer will appear because of the topological order)
             for inp in node.input_paths:
+
+                # add internal buffer latency for internal computation
+                if not isinstance(node, Output):
+                    for entry in node.input_paths[inp]:
+                        name = entry[-1]
+                        entry[2] += node.dist_to_center[name]
+
                 # compute maximum delay size per input
                 max_delay = max(node.input_paths[inp])
-                max_delay[
-                    2] += 1  # add an extra delay cycle for the processing in the kernel node
+                max_delay[2] += 1  # add an extra delay cycle for the processing in the kernel node
                 # loop over all inputs and set their size relative to the max size to have data ready at the exact
                 # same time
                 for entry in node.input_paths[inp]:
                     name = entry[-1]
                     max_size = stencilflow.convert_3d_to_1d(
                         dimensions=self.dimensions,
-                        index=stencilflow.list_subtract_cwise(
-                            max_delay[:-1], entry[:-1]))
-                    node.delay_buffer[name] = BoundedQueue(name=name,
-                                                           maxsize=max_size)
-                    node.delay_buffer[name].import_data(
-                        [None] * node.delay_buffer[name].maxsize)
+                        index=stencilflow.list_subtract_cwise(max_delay[:-1], entry[:-1]))
+                    node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                    node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+
+                # remove internal buffer latency for internal computation
+                if not isinstance(node, Output):
+                    for entry in node.input_paths[inp]:
+                        name = entry[-1]
+                        entry[2] -= node.dist_to_center[name]
+
             # set input node delay buffers to 1
             if isinstance(node, Input):
                 node.delay_buffer = BoundedQueue(name=node.name,
@@ -716,7 +728,7 @@ def report(self, name):
                         u.name, v.name, entry.name, entry.maxsize))
                     total_fast += entry.maxsize
         print("buffer size slow memory: {} \nbuffer size fast memory: {}".format(
-                total_slow, total_fast))
+            total_slow, total_fast))
 
     def operation_count(self):
         """For each operation type found in the ASTs, return a tuple of
@@ -789,6 +801,14 @@ def runtime_lower_bound(self):
                         type=int)
     parser.add_argument("-report", action="store_true")
     parser.add_argument("-simulate", action="store_true")
+    parser.add_argument("-opt", action="store_true")
+    parser.add_argument("-opt_goal", default=["min_fast_mem", 12000], nargs="+")
+    """
+        choices:
+        - min_com_vol, FAST_MEM_BOUND, SLOW_MEM_BOUND
+        - min_fast_mem, COM_VOL_BOUND
+        - opt_ratio, RATIO
+    """
     args = parser.parse_args()
     args.log_level = stencilflow.log_level.LogLevel(args.log_level)
     program_description = stencilflow.parse_json(args.stencil_file)
@@ -809,6 +829,18 @@ def runtime_lower_bound(self):
                         log_level=LogLevel(args.log_level))
         sim.simulate()
 
+    # run the optimizer for the goal named in args.opt_goal[0]; remaining entries are its bounds
+    if args.opt:
+        from stencilflow import Optimizer
+
+        opt = Optimizer(chain.kernel_nodes, chain.dimensions)  # script level has no `self`; use `chain`
+        if args.opt_goal[0] == "min_com_vol":
+            opt.minimize_comm_vol(fast_memory_bound=args.opt_goal[1], slow_memory_bound=args.opt_goal[2])
+        if args.opt_goal[0] == "min_fast_mem":
+            opt.minimize_fast_mem(communication_volume_bound=args.opt_goal[1])
+        if args.opt_goal[0] == "opt_ratio":
+            opt.optimize_to_ratio(ratio=args.opt_goal[1])
+
     # output a report if argument -report is true
     if args.report:
         chain.report(args.stencil_file)
diff --git a/stencilflow/sdfg_generator.py b/stencilflow/sdfg_generator.py
index 22629b4..87e54e5 100644
--- a/stencilflow/sdfg_generator.py
+++ b/stencilflow/sdfg_generator.py
@@ -28,7 +28,7 @@
 
 import networkx as nx
 
-MINIMUM_CHANNEL_DEPTH = 2048
+MINIMUM_CHANNEL_DEPTH = 1024
 
 NUM_BANKS = 4
 
diff --git a/test/stencils/horidiff_min.json b/test/stencils/horidiff_min.json
new file mode 100644
index 0000000..cf2679b
--- /dev/null
+++ b/test/stencils/horidiff_min.json
@@ -0,0 +1,84 @@
+{
+ "inputs": {
+  "inA": {
+   "data": "inA_float32.dat",
+   "data_type": "float32",
+   "input_dims": [
+    "i"
+   ]
+  }
+ },
+ "outputs": [
+  "out"
+ ],
+ "dimensions": [
+  10,
+  10,
+  10
+ ],
+ "vectorization": 1,
+ "program": {
+  "k0": {
+   "data_type": "float32",
+   "computation_string": "k0 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k1": {
+   "data_type": "float32",
+   "computation_string": "k1 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k2": {
+   "data_type": "float32",
+   "computation_string": "k2 = k1[i, j, k] + k0[i+1, j, k] + k0[i, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "k3": {
+   "data_type": "float32",
+   "computation_string": "k3 = k0[i, j, k] + k1[i+1, j+1, k+1] + k1[i, j, k]",
+   "boundary_conditions": {
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "out": {
+   "data_type": "float32",
+   "computation_string": "out = k2[i, j, k] + k3[i, j, k]",
+   "boundary_conditions": {
+    "k2":{
+       "type": "constant",
+       "value": 0.0
+    },
+    "k3": {
+       "type": "constant",
+       "value": 0.0
+    }
+    }
+   }
+ }
+}
diff --git a/test/stencils/horidiff_min_ext.json b/test/stencils/horidiff_min_ext.json
new file mode 100644
index 0000000..607ff61
--- /dev/null
+++ b/test/stencils/horidiff_min_ext.json
@@ -0,0 +1,94 @@
+{
+ "inputs": {
+  "inA": {
+   "data": "inA_float32.dat",
+   "data_type": "float32",
+   "input_dims": [
+    "i"
+   ]
+  }
+ },
+ "outputs": [
+  "out"
+ ],
+ "dimensions": [
+  10,
+  10,
+  10
+ ],
+ "vectorization": 1,
+ "program": {
+  "k0": {
+   "data_type": "float32",
+   "computation_string": "k0 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k1": {
+   "data_type": "float32",
+   "computation_string": "k1 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k2": {
+   "data_type": "float32",
+   "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "k3": {
+   "data_type": "float32",
+   "computation_string": "k3 = k0[i, j, k] + k4[i+1, j+1, k+1] + k4[i, j, k]",
+   "boundary_conditions": {
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k4": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+   "k4": {
+   "data_type": "float32",
+   "computation_string": "k4 = k1[i, j, k] + k1[i+1, j+1, k+1]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "out": {
+   "data_type": "float32",
+   "computation_string": "out = k2[i,j,k] + k3[i,j,k]",
+   "boundary_conditions": {
+    "k2":{
+       "type": "constant",
+       "value": 0.0
+    },
+    "k3": {
+       "type": "constant",
+       "value": 0.0
+    }
+    }
+   }
+ }
+}
diff --git a/test/stencils/jacobi3d_512x512x512.json b/test/stencils/jacobi3d_512x512x512.json
new file mode 100644
index 0000000..82db32d
--- /dev/null
+++ b/test/stencils/jacobi3d_512x512x512.json
@@ -0,0 +1,24 @@
+{
+    "inputs": {
+        "a": {
+            "data": "data/zeros_32x32x32_fp32.dat",
+            "data_type": "float32"
+        }
+    },
+    "outputs": ["b"],
+    "dimensions": [512, 512, 512],
+    "program": {
+        "b": {
+            "computation_string":
+            "b = 0.16666666 * (a[i-1,j,k] + a[i+1,j,k] + a[i,j-1,k] + a[i,j+1,k] + a[i,j,k-1] + a[i,j,k+1])",
+            "boundary_conditions": {
+                "a": {
+                    "type": "constant",
+                    "value": 1.0
+                }
+            },
+            "data_type":
+            "float32"
+        }
+    }
+}