@@ -23,6 +23,7 @@
 #include <cuda_runtime.h>
 #include <dgl/immutable_graph.h>
 #include <dgl/runtime/device_api.h>
+#include <dgl/runtime/tensordispatch.h>

 #include <algorithm>
 #include <memory>
@@ -36,6 +37,7 @@
 using namespace dgl::aten;
 using namespace dgl::runtime::cuda;
 using namespace dgl::transform::cuda;
+using TensorDispatcher = dgl::runtime::TensorDispatcher;

 namespace dgl {
 namespace transform {
@@ -165,6 +167,9 @@ struct CUDAIdsMapper {
             NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
       }
     }
+
+    cudaEvent_t copyEvent;
+    NDArray new_len_tensor;
     // Populate the mappings.
     if (generate_lhs_nodes) {
       int64_t* count_lhs_device = static_cast<int64_t*>(
@@ -174,13 +179,23 @@ struct CUDAIdsMapper {
           src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
           stream);

-      device->CopyDataFromTo(
-          count_lhs_device, 0, num_nodes_per_type.data(), 0,
-          sizeof(*num_nodes_per_type.data()) * num_ntypes, ctx,
-          DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1});
-      device->StreamSync(ctx, stream);
+      CUDA_CALL(cudaEventCreate(&copyEvent));
+      if (TensorDispatcher::Global()->IsAvailable()) {
+        new_len_tensor = NDArray::PinnedEmpty(
+            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
+            DGLContext{kDGLCPU, 0});
+      } else {
+        // Use pageable memory; it will unnecessarily block, but is functional.
+        new_len_tensor = NDArray::Empty(
+            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
+            DGLContext{kDGLCPU, 0});
+      }
+      CUDA_CALL(cudaMemcpyAsync(
+          new_len_tensor->data, count_lhs_device,
+          sizeof(*num_nodes_per_type.data()) * num_ntypes,
+          cudaMemcpyDeviceToHost, stream));
+      CUDA_CALL(cudaEventRecord(copyEvent, stream));

-      // Wait for the node counts to finish transferring.
       device->FreeWorkspace(ctx, count_lhs_device);
     } else {
       maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);
@@ -189,14 +204,23 @@ struct CUDAIdsMapper {
         num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
       }
     }
-    // Resize lhs nodes.
+    // Map node numberings from global to local, and build pointer for CSR.
+    auto ret = MapEdges(graph, edge_arrays, node_maps, stream);
+
     if (generate_lhs_nodes) {
+      // Wait for the previous copy to finish.
+      CUDA_CALL(cudaEventSynchronize(copyEvent));
+      CUDA_CALL(cudaEventDestroy(copyEvent));
+
+      // Resize lhs nodes.
       for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
+        num_nodes_per_type[ntype] =
+            static_cast<int64_t*>(new_len_tensor->data)[ntype];
         lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
       }
     }
-    // Map node numberings from global to local, and build pointer for CSR.
-    return MapEdges(graph, edge_arrays, node_maps, stream);
+
+    return ret;
   }
 };
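For reference, here is a minimal standalone sketch of the host-synchronization pattern this change adopts: stage a small device-side result into pinned host memory with cudaMemcpyAsync, record a CUDA event on the same stream, keep issuing GPU work, and block on the event only when the host actually needs the values. Everything below (count_device, count_host, kNumTypes, the CUDA_CHECK macro) is illustrative and is not taken from the DGL sources.

// Minimal sketch (illustrative only): async device-to-host copy of small
// counters into pinned memory, synchronized via a CUDA event.
#include <cuda_runtime.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                              \
  do {                                                                \
    cudaError_t err__ = (call);                                       \
    if (err__ != cudaSuccess) {                                       \
      std::fprintf(stderr, "CUDA error: %s\n",                        \
                   cudaGetErrorString(err__));                        \
      std::exit(EXIT_FAILURE);                                        \
    }                                                                 \
  } while (0)

int main() {
  constexpr int kNumTypes = 4;  // stand-in for num_ntypes
  const size_t nbytes = kNumTypes * sizeof(int64_t);

  cudaStream_t stream;
  cudaEvent_t copy_event;
  CUDA_CHECK(cudaStreamCreate(&stream));
  CUDA_CHECK(cudaEventCreate(&copy_event));

  // Device buffer playing the role of count_lhs_device.
  int64_t* count_device = nullptr;
  CUDA_CHECK(cudaMalloc(&count_device, nbytes));
  CUDA_CHECK(cudaMemsetAsync(count_device, 0, nbytes, stream));

  // Pinned host buffer so the copy below can actually run asynchronously;
  // with pageable memory cudaMemcpyAsync falls back to a blocking copy.
  int64_t* count_host = nullptr;
  CUDA_CHECK(cudaHostAlloc(&count_host, nbytes, cudaHostAllocDefault));

  // Enqueue the copy and record an event instead of synchronizing the stream.
  CUDA_CHECK(cudaMemcpyAsync(count_host, count_device, nbytes,
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaEventRecord(copy_event, stream));

  // ... further GPU work (e.g. the MapEdges call in the patch above) can be
  // issued here without waiting for the copy ...

  // Block only when the host actually needs the counts.
  CUDA_CHECK(cudaEventSynchronize(copy_event));
  for (int i = 0; i < kNumTypes; ++i) {
    std::printf("count[%d] = %lld\n", i,
                static_cast<long long>(count_host[i]));
  }

  CUDA_CHECK(cudaEventDestroy(copy_event));
  CUDA_CHECK(cudaStreamDestroy(stream));
  CUDA_CHECK(cudaFreeHost(count_host));
  CUDA_CHECK(cudaFree(count_device));
  return 0;
}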