Commit c093e65

Merge branch 'master' into gb_cuda_optimize_neighbor_sampler

Authored Mar 18, 2024
2 parents a672a29 + a2c5472, commit c093e65

8 files changed: +227 -47 lines

graphbolt/include/graphbolt/continuous_seed.h (+25)

@@ -94,6 +94,31 @@ class continuous_seed {
 #endif  // __CUDA_ARCH__
 };
 
+class single_seed {
+  uint64_t seed_;
+
+ public:
+  /* implicit */ single_seed(const int64_t seed) : seed_(seed) {}  // NOLINT
+
+  single_seed(torch::Tensor seed_arr)
+      : seed_(seed_arr.data_ptr<int64_t>()[0]) {}
+
+#ifdef __CUDACC__
+  __device__ inline float uniform(const uint64_t id) const {
+    const uint64_t kCurandSeed = 999961;  // Could be any random number.
+    curandStatePhilox4_32_10_t rng;
+    curand_init(kCurandSeed, seed_, id, &rng);
+    return curand_uniform(&rng);
+  }
+#else
+  inline float uniform(const uint64_t id) const {
+    pcg32 ng0(seed_, id);
+    std::uniform_real_distribution<float> uni;
+    return uni(ng0);
+  }
+#endif  // __CUDA_ARCH__
+};
+
 }  // namespace graphbolt
 
 #endif  // GRAPHBOLT_CONTINUOUS_SEED_H_
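
The useful property of single_seed is that the variate for an edge depends only on the pair (seed_, id): the same seed reproduces the same variates on every call, which is what lets separate sampling calls that share a seed draw overlapping neighborhoods. Below is a minimal Python sketch of that contract, using NumPy's Philox bit generator as an illustrative stand-in for curand's Philox4_32_10 and pcg32 (it is not bit-for-bit identical to either).

import numpy as np

def single_seed_uniform(seed: int, edge_id: int) -> float:
    # Deterministic in (seed, edge_id): the same pair always yields the
    # same variate, mirroring the role of single_seed::uniform() above.
    rng = np.random.Generator(np.random.Philox(key=[seed, edge_id]))
    return float(rng.random())

assert single_seed_uniform(7, 42) == single_seed_uniform(7, 42)  # reproducible
assert single_seed_uniform(7, 43) != single_seed_uniform(7, 42)  # varies with id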

graphbolt/include/graphbolt/fused_csc_sampling_graph.h (+21 -10)

@@ -17,7 +17,11 @@
 namespace graphbolt {
 namespace sampling {
 
-enum SamplerType { NEIGHBOR, LABOR };
+enum SamplerType { NEIGHBOR, LABOR, LABOR_DEPENDENT };
+
+constexpr bool is_labor(SamplerType S) {
+  return S == SamplerType::LABOR || S == SamplerType::LABOR_DEPENDENT;
+}
 
 template <SamplerType S>
 struct SamplerArgs;
@@ -27,6 +31,13 @@ struct SamplerArgs<SamplerType::NEIGHBOR> {};
 
 template <>
 struct SamplerArgs<SamplerType::LABOR> {
+  const torch::Tensor& indices;
+  single_seed random_seed;
+  int64_t num_nodes;
+};
+
+template <>
+struct SamplerArgs<SamplerType::LABOR_DEPENDENT> {
   const torch::Tensor& indices;
   continuous_seed random_seed;
   int64_t num_nodes;
@@ -555,12 +566,12 @@ int64_t Pick(
     const torch::optional<torch::Tensor>& probs_or_mask,
     SamplerArgs<SamplerType::NEIGHBOR> args, PickedType* picked_data_ptr);
 
-template <typename PickedType>
-int64_t Pick(
+template <SamplerType S, typename PickedType>
+std::enable_if_t<is_labor(S), int64_t> Pick(
     int64_t offset, int64_t num_neighbors, int64_t fanout, bool replace,
     const torch::TensorOptions& options,
-    const torch::optional<torch::Tensor>& probs_or_mask,
-    SamplerArgs<SamplerType::LABOR> args, PickedType* picked_data_ptr);
+    const torch::optional<torch::Tensor>& probs_or_mask, SamplerArgs<S> args,
+    PickedType* picked_data_ptr);
 
 template <typename PickedType>
 int64_t TemporalPick(
@@ -619,13 +630,13 @@ int64_t TemporalPickByEtype(
     PickedType* picked_data_ptr);
 
 template <
-    bool NonUniform, bool Replace, typename ProbsType, typename PickedType,
-    int StackSize = 1024>
-int64_t LaborPick(
+    bool NonUniform, bool Replace, typename ProbsType, SamplerType S,
+    typename PickedType, int StackSize = 1024>
+std::enable_if_t<is_labor(S), int64_t> LaborPick(
     int64_t offset, int64_t num_neighbors, int64_t fanout,
     const torch::TensorOptions& options,
-    const torch::optional<torch::Tensor>& probs_or_mask,
-    SamplerArgs<SamplerType::LABOR> args, PickedType* picked_data_ptr);
+    const torch::optional<torch::Tensor>& probs_or_mask, SamplerArgs<S> args,
+    PickedType* picked_data_ptr);
 
 }  // namespace sampling
 }  // namespace graphbolt

graphbolt/src/fused_csc_sampling_graph.cc (+46 -34)

@@ -15,6 +15,7 @@
 #include <limits>
 #include <numeric>
 #include <tuple>
+#include <type_traits>
 #include <vector>
 
 #include "./macro.h"
@@ -660,26 +661,37 @@ c10::intrusive_ptr<FusedSampledSubgraph> FusedCSCSamplingGraph::SampleNeighbors(
   }
 
   if (layer) {
-    SamplerArgs<SamplerType::LABOR> args = [&] {
-      if (random_seed.has_value()) {
-        return SamplerArgs<SamplerType::LABOR>{
-            indices_,
-            {random_seed.value(), static_cast<float>(seed2_contribution)},
-            NumNodes()};
-      } else {
-        return SamplerArgs<SamplerType::LABOR>{
-            indices_,
-            RandomEngine::ThreadLocal()->RandInt(
-                static_cast<int64_t>(0), std::numeric_limits<int64_t>::max()),
-            NumNodes()};
-      }
-    }();
-    return SampleNeighborsImpl(
-        nodes.value(), return_eids,
-        GetNumPickFn(fanouts, replace, type_per_edge_, probs_or_mask),
-        GetPickFn(
-            fanouts, replace, indptr_.options(), type_per_edge_, probs_or_mask,
-            args));
+    if (random_seed.has_value() && random_seed->numel() >= 2) {
+      SamplerArgs<SamplerType::LABOR_DEPENDENT> args{
+          indices_,
+          {random_seed.value(), static_cast<float>(seed2_contribution)},
+          NumNodes()};
+      return SampleNeighborsImpl(
+          nodes.value(), return_eids,
+          GetNumPickFn(fanouts, replace, type_per_edge_, probs_or_mask),
+          GetPickFn(
+              fanouts, replace, indptr_.options(), type_per_edge_,
+              probs_or_mask, args));
+    } else {
+      auto args = [&] {
+        if (random_seed.has_value() && random_seed->numel() == 1) {
+          return SamplerArgs<SamplerType::LABOR>{
+              indices_, random_seed.value(), NumNodes()};
+        } else {
+          return SamplerArgs<SamplerType::LABOR>{
+              indices_,
+              RandomEngine::ThreadLocal()->RandInt(
+                  static_cast<int64_t>(0), std::numeric_limits<int64_t>::max()),
+              NumNodes()};
+        }
+      }();
+      return SampleNeighborsImpl(
+          nodes.value(), return_eids,
+          GetNumPickFn(fanouts, replace, type_per_edge_, probs_or_mask),
+          GetPickFn(
+              fanouts, replace, indptr_.options(), type_per_edge_,
+              probs_or_mask, args));
+    }
   } else {
     SamplerArgs<SamplerType::NEIGHBOR> args;
     return SampleNeighborsImpl(
@@ -1297,7 +1309,7 @@ int64_t TemporalPick(
     }
     return picked_indices.numel();
   }
-  if constexpr (S == SamplerType::LABOR) {
+  if constexpr (is_labor(S)) {
     return Pick(
         offset, num_neighbors, fanout, replace, options, masked_prob, args,
         picked_data_ptr);
@@ -1383,12 +1395,12 @@ int64_t TemporalPickByEtype(
   return pick_offset;
 }
 
-template <typename PickedType>
-int64_t Pick(
+template <SamplerType S, typename PickedType>
+std::enable_if_t<is_labor(S), int64_t> Pick(
     int64_t offset, int64_t num_neighbors, int64_t fanout, bool replace,
     const torch::TensorOptions& options,
-    const torch::optional<torch::Tensor>& probs_or_mask,
-    SamplerArgs<SamplerType::LABOR> args, PickedType* picked_data_ptr) {
+    const torch::optional<torch::Tensor>& probs_or_mask, SamplerArgs<S> args,
+    PickedType* picked_data_ptr) {
   if (fanout == 0) return 0;
   if (probs_or_mask.has_value()) {
     if (fanout < 0) {
@@ -1438,9 +1450,9 @@ inline T invcdf(T u, int64_t n, T rem) {
   return rem * (one - std::pow(one - u, one / n));
 }
 
-template <typename T>
+template <typename T, typename seed_t>
 inline T jth_sorted_uniform_random(
-    continuous_seed seed, int64_t t, int64_t c, int64_t j, T& rem, int64_t n) {
+    seed_t seed, int64_t t, int64_t c, int64_t j, T& rem, int64_t n) {
   const T u = seed.uniform(t + j * c);
   // https://mathematica.stackexchange.com/a/256707
   rem -= invcdf(u, n, rem);
@@ -1474,13 +1486,13 @@ inline T jth_sorted_uniform_random(
  * should be put. Enough memory space should be allocated in advance.
  */
 template <
-    bool NonUniform, bool Replace, typename ProbsType, typename PickedType,
-    int StackSize>
-inline int64_t LaborPick(
+    bool NonUniform, bool Replace, typename ProbsType, SamplerType S,
+    typename PickedType, int StackSize>
+inline std::enable_if_t<is_labor(S), int64_t> LaborPick(
     int64_t offset, int64_t num_neighbors, int64_t fanout,
     const torch::TensorOptions& options,
-    const torch::optional<torch::Tensor>& probs_or_mask,
-    SamplerArgs<SamplerType::LABOR> args, PickedType* picked_data_ptr) {
+    const torch::optional<torch::Tensor>& probs_or_mask, SamplerArgs<S> args,
+    PickedType* picked_data_ptr) {
   fanout = Replace ? fanout : std::min(fanout, num_neighbors);
   if (!NonUniform && !Replace && fanout >= num_neighbors) {
     std::iota(picked_data_ptr, picked_data_ptr + num_neighbors, offset);
@@ -1504,8 +1516,8 @@ inline int64_t LaborPick(
   }
   AT_DISPATCH_INDEX_TYPES(
       args.indices.scalar_type(), "LaborPickMain", ([&] {
-        const index_t* local_indices_data =
-            args.indices.data_ptr<index_t>() + offset;
+        const auto local_indices_data =
+            reinterpret_cast<index_t*>(args.indices.data_ptr()) + offset;
         if constexpr (Replace) {
           // [Algorithm] @mfbalin
           // Use a max-heap to get rid of the big random numbers and filter the
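
In short, the new branch selects the sampler variant from the shape of random_seed: two or more elements dispatch to LABOR_DEPENDENT (an interpolating continuous_seed), a single element dispatches to LABOR with the cheaper single_seed, and no seed falls back to LABOR with a fresh thread-local random integer. A hedged Python paraphrase of that selection logic follows; the helper name and return shape are illustrative, not part of the C++ API.

import torch

def select_labor_args(random_seed, seed2_contribution, num_nodes):
    # Mirrors the branch added to FusedCSCSamplingGraph::SampleNeighbors above.
    if random_seed is not None and random_seed.numel() >= 2:
        # Two seeds: LABOR_DEPENDENT, continuous_seed interpolates between them.
        return "LABOR_DEPENDENT", (random_seed, float(seed2_contribution)), num_nodes
    if random_seed is not None and random_seed.numel() == 1:
        # One seed: LABOR with the lighter single_seed.
        return "LABOR", int(random_seed.item()), num_nodes
    # No seed supplied: draw one, standing in for RandomEngine::ThreadLocal()->RandInt.
    fresh = int(torch.randint(0, 2**62, (1,)).item())
    return "LABOR", fresh, num_nodes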

python/dgl/graphbolt/impl/fused_csc_sampling_graph.py (+34)

@@ -791,6 +791,32 @@ def sample_layer_neighbors(
             corresponding to each neighboring edge of a node. It must be a 1D
             floating-point or boolean tensor, with the number of elements
             equalling the total number of edges.
+        random_seed: torch.Tensor, optional
+            An int64 tensor with one or two elements.
+
+            The passed random_seed makes it so that for any seed node ``s`` and
+            its neighbor ``t``, the rolled random variate ``r_t`` is the same
+            for any call to this function with the same random seed. When
+            sampling as part of the same batch, one would want identical seeds
+            so that LABOR can sample globally. For example, for heterogeneous
+            graphs, a single random seed is passed for all edge types. This
+            samples far fewer nodes compared to using a unique random seed for
+            each edge type. If one called this function individually for each
+            edge type of a heterogeneous graph with different random seeds,
+            then it would run LABOR locally for each edge type, resulting in a
+            larger number of nodes being sampled.
+
+            If this function is called without a ``random_seed``, the random
+            seed is drawn from GraphBolt's random engine. Pass an identical
+            random_seed if multiple calls to this function are used to sample
+            as part of a single batch.
+
+            If given two numbers, then the ``seed2_contribution`` argument
+            determines the interpolation between the two random seeds.
+        seed2_contribution: float, optional
+            A float value in [0, 1) that determines the contribution of the
+            second random seed, ``random_seed[-1]``, to generating the random
+            variates.
 
         Returns
         -------
@@ -826,6 +852,14 @@ def sample_layer_neighbors(
            nodes = self._convert_to_homogeneous_nodes(nodes)

        self._check_sampler_arguments(nodes, fanouts, probs_name)
+        if random_seed is not None:
+            assert (
+                1 <= len(random_seed) <= 2
+            ), "There should be 1 or 2 random seeds."
+            if len(random_seed) == 2:
+                assert (
+                    0 <= seed2_contribution <= 1
+                ), "seed2_contribution should be in [0, 1]."
        has_original_eids = (
            self.edge_attributes is not None
            and ORIGINAL_EDGE_ID in self.edge_attributes
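
For example, a caller that splits one logical batch across two calls would pass the same seed to both so that LABOR samples globally. The sketch below assumes ``graph`` is a FusedCSCSamplingGraph and ``seeds_a``/``seeds_b`` are seed-node tensors; the variable names and the fanout values are illustrative.

import torch

fanouts = torch.tensor([10, 10])
shared_seed = torch.randint(0, 2**62, (1,), dtype=torch.int64)

# Same seed for both calls -> identical per-neighbor variates, so the two
# sampled subgraphs overlap as if drawn in a single LABOR pass.
sg_a = graph.sample_layer_neighbors(seeds_a, fanouts, random_seed=shared_seed)
sg_b = graph.sample_layer_neighbors(seeds_b, fanouts, random_seed=shared_seed)

# Two seeds plus seed2_contribution interpolate between two sets of variates.
seed_pair = torch.randint(0, 2**62, (2,), dtype=torch.int64)
sg_c = graph.sample_layer_neighbors(
    seeds_a, fanouts, random_seed=seed_pair, seed2_contribution=0.5
)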

python/dgl/graphbolt/impl/neighbor_sampler.py (+14 -1)

@@ -483,7 +483,7 @@ class LayerNeighborSampler(NeighborSamplerImpl):
     Sampler that builds computational dependency of node representations via
     labor sampling for multilayer GNN from the NeurIPS 2023 paper
     `Layer-Neighbor Sampling -- Defusing Neighborhood Explosion in GNNs
-    <https://arxiv.org/abs/2210.13339>`__
+    <https://proceedings.neurips.cc/paper_files/paper/2023/file/51f9036d5e7ae822da8f6d4adda1fb39-Paper-Conference.pdf>`__
 
     Layer-Neighbor sampler is responsible for sampling a subgraph from given
     data. It returns an induced subgraph along with compacted information. In
@@ -526,6 +526,19 @@ class LayerNeighborSampler(NeighborSamplerImpl):
         Boolean indicating whether seeds between hops will be deduplicated.
         If True, the same elements in seeds will be deleted to only one.
         Otherwise, the same elements will be remained.
+    layer_dependency: bool
+        Boolean indicating whether different layers should use the same random
+        variates. Results in a reduction in the number of nodes sampled and
+        turns LayerNeighborSampler into a subgraph sampling method. Later
+        layers are guaranteed to sample neighbors that overlap with those of
+        the previous layers.
+    batch_dependency: int
+        Specifies whether consecutive minibatches should use similar random
+        variates. Results in a higher temporal access locality of sampled
+        nodes and edges. Setting it to :math:`\\kappa` slows down the change in
+        the random variates proportionally to :math:`\\frac{1}{\\kappa}`.
+        Implements the dependent minibatching approach of `arXiv:2310.12403
+        <https://arxiv.org/abs/2310.12403>`__.
 
     Examples
     -------
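
A usage sketch for the two new knobs, assuming an existing ItemSampler datapipe ``dp`` and a FusedCSCSamplingGraph ``graph`` (both placeholders); the keyword names follow the docstring above, while the fanout format and values are assumptions.

import torch
import dgl.graphbolt as gb

sampler = gb.LayerNeighborSampler(
    dp,                      # upstream datapipe producing minibatches
    graph,
    fanouts=[torch.tensor([10]), torch.tensor([10])],
    layer_dependency=True,   # reuse the same random variates across layers
    batch_dependency=32,     # kappa: variates drift slowly across minibatches
)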

python/dgl/graphbolt/minibatch.py (+5)

@@ -536,6 +536,11 @@ def to_pyg_data(self):
                 batch_size = len(next(iter(self.node_pairs.values()))[0])
             else:
                 batch_size = len(self.node_pairs[0])
+        elif self.seeds is not None:
+            if isinstance(self.seeds, Dict):
+                batch_size = len(next(iter(self.seeds.values())))
+            else:
+                batch_size = len(self.seeds)
         else:
             batch_size = None
         pyg_data = Data(
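
With this change, to_pyg_data() can infer batch_size from ``seeds`` as well (a tensor or a per-type dict), not only from node_pairs or seed_nodes. A small sketch mirroring the new branch; it assumes a MiniBatch populated only with ``seeds`` is otherwise valid for conversion, as exercised by the new test below.

import torch
import dgl.graphbolt as gb

mb = gb.MiniBatch(seeds=torch.tensor([0, 1, 2]))
assert mb.to_pyg_data().batch_size == 3        # len() of the seeds tensor

mb = gb.MiniBatch(seeds={"user": torch.tensor([[0, 1], [2, 3]])})
assert mb.to_pyg_data().batch_size == 2        # len() of the first dict value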

python/setup.py (+1)

@@ -227,6 +227,7 @@ def get_lib_file_path(lib_name, backend=""):
     "tqdm",
     "psutil>=5.8.0",
     "torchdata>=0.5.0",
+    "pandas",
 ]
 if "DGLBACKEND" in os.environ and os.environ["DGLBACKEND"] != "pytorch":
     install_requires.pop(install_requires.index("torchdata>=0.5.0"))

tests/python/pytorch/graphbolt/test_minibatch.py (+81 -2)

@@ -868,7 +868,7 @@ def test_dgl_link_predication_hetero(mode):
     )
 
 
-def test_to_pyg_data():
+def test_to_pyg_data_original():
     test_minibatch = create_homo_minibatch()
     test_minibatch.seed_nodes = torch.tensor([0, 1])
     test_minibatch.labels = torch.tensor([7, 8])
@@ -929,7 +929,86 @@ def test_to_pyg_data():
     try:
         pyg_data = test_minibatch.to_pyg_data()
         assert (
-            pyg_data.x is None,
+            pyg_data.x is None
+        ), "Multiple features case should raise an error."
+    except AssertionError as e:
+        assert (
+            str(e)
+            == "`to_pyg_data` only supports single feature homogeneous graph."
+        )
+
+
+def test_to_pyg_data():
+    test_minibatch = create_homo_minibatch()
+    test_minibatch.seeds = torch.tensor([0, 1])
+    test_minibatch.labels = torch.tensor([7, 8])
+
+    expected_edge_index = torch.tensor(
+        [[0, 0, 1, 1, 1, 2, 2, 2, 2], [0, 1, 0, 1, 2, 0, 1, 2, 3]]
+    )
+    expected_node_features = next(iter(test_minibatch.node_features.values()))
+    expected_labels = torch.tensor([7, 8])
+    expected_batch_size = 2
+    expected_n_id = torch.tensor([10, 11, 12, 13])
+
+    pyg_data = test_minibatch.to_pyg_data()
+    pyg_data.validate()
+    assert torch.equal(pyg_data.edge_index, expected_edge_index)
+    assert torch.equal(pyg_data.x, expected_node_features)
+    assert torch.equal(pyg_data.y, expected_labels)
+    assert pyg_data.batch_size == expected_batch_size
+    assert torch.equal(pyg_data.n_id, expected_n_id)
+
+    test_minibatch.seeds = torch.tensor([[0, 1], [2, 3]])
+    assert pyg_data.batch_size == expected_batch_size
+
+    test_minibatch.seeds = {"A": torch.tensor([0, 1])}
+    assert pyg_data.batch_size == expected_batch_size
+
+    test_minibatch.seeds = {"A": torch.tensor([[0, 1], [2, 3]])}
+    assert pyg_data.batch_size == expected_batch_size
+
+    subgraph = test_minibatch.sampled_subgraphs[0]
+    # Test with sampled_csc as None.
+    test_minibatch = gb.MiniBatch(
+        sampled_subgraphs=None,
+        node_features={"feat": expected_node_features},
+        labels=expected_labels,
+    )
+    pyg_data = test_minibatch.to_pyg_data()
+    assert pyg_data.edge_index is None, "Edge index should be none."
+
+    # Test with node_features as None.
+    test_minibatch = gb.MiniBatch(
+        sampled_subgraphs=[subgraph],
+        node_features=None,
+        labels=expected_labels,
+    )
+    pyg_data = test_minibatch.to_pyg_data()
+    assert pyg_data.x is None, "Node features should be None."
+
+    # Test with labels as None.
+    test_minibatch = gb.MiniBatch(
+        sampled_subgraphs=[subgraph],
+        node_features={"feat": expected_node_features},
+        labels=None,
+    )
+    pyg_data = test_minibatch.to_pyg_data()
+    assert pyg_data.y is None, "Labels should be None."
+
+    # Test with multiple features.
+    test_minibatch = gb.MiniBatch(
+        sampled_subgraphs=[subgraph],
+        node_features={
+            "feat": expected_node_features,
+            "extra_feat": torch.tensor([[3], [4]]),
+        },
+        labels=expected_labels,
+    )
+    try:
+        pyg_data = test_minibatch.to_pyg_data()
+        assert (
+            pyg_data.x is None
         ), "Multiple features case should raise an error."
     except AssertionError as e:
         assert (