Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Dataset] CornellDataset and TexasDataset #5513

Merged
merged 5 commits into from
Apr 3, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/api/python/dgl.data.rst
Original file line number Diff line number Diff line change
@@ -59,6 +59,8 @@ Datasets for node classification/regression tasks
ChameleonDataset
SquirrelDataset
ActorDataset
CornellDataset
TexasDataset

Edge Prediction Datasets
---------------------------------------
7 changes: 6 additions & 1 deletion python/dgl/data/__init__.py
Original file line number Diff line number Diff line change
@@ -54,8 +54,13 @@
from .tu import LegacyTUDataset, TUDataset
from .utils import *
from .cluster import CLUSTERDataset
from .geom_gcn import (
ChameleonDataset,
CornellDataset,
SquirrelDataset,
TexasDataset,
)
from .pattern import PATTERNDataset
from .wiki_network import ChameleonDataset, SquirrelDataset
from .wikics import WikiCSDataset
from .yelp import YelpDataset
from .zinc import ZINCDataset
173 changes: 161 additions & 12 deletions python/dgl/data/wiki_network.py → python/dgl/data/geom_gcn.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
"""
Wikipedia page-page networks on two topics: chameleons and squirrels.
"""
"""Datasets introduced in the Geom-GCN paper."""
import os

import numpy as np
@@ -10,11 +8,10 @@
from .utils import _get_dgl_url


class WikiNetworkDataset(DGLBuiltinDataset):
r"""Wikipedia page-page networks from `Multi-scale Attributed
Node Embedding <https://arxiv.org/abs/1909.13021>`__ and later modified by
class GeomGCNDataset(DGLBuiltinDataset):
r"""Datasets introduced in
`Geom-GCN: Geometric Graph Convolutional Networks
<https://arxiv.org/abs/2002.05287>`
<https://arxiv.org/abs/2002.05287>`__

Parameters
----------
@@ -34,7 +31,7 @@ class WikiNetworkDataset(DGLBuiltinDataset):

def __init__(self, name, raw_dir, force_reload, verbose, transform):
url = _get_dgl_url(f"dataset/{name}.zip")
super(WikiNetworkDataset, self).__init__(
super(GeomGCNDataset, self).__init__(
name=name,
url=url,
raw_dir=raw_dir,
@@ -106,11 +103,11 @@ def num_classes(self):
return self._num_classes


class ChameleonDataset(WikiNetworkDataset):
class ChameleonDataset(GeomGCNDataset):
r"""Wikipedia page-page network on chameleons from `Multi-scale Attributed
Node Embedding <https://arxiv.org/abs/1909.13021>`__ and later modified by
`Geom-GCN: Geometric Graph Convolutional Networks
<https://arxiv.org/abs/2002.05287>`
<https://arxiv.org/abs/2002.05287>`__

Nodes represent articles from the English Wikipedia, edges reflect mutual
links between them. Node features indicate the presence of particular nouns
@@ -182,11 +179,11 @@ def __init__(
)


class SquirrelDataset(WikiNetworkDataset):
class SquirrelDataset(GeomGCNDataset):
r"""Wikipedia page-page network on squirrels from `Multi-scale Attributed
Node Embedding <https://arxiv.org/abs/1909.13021>`__ and later modified by
`Geom-GCN: Geometric Graph Convolutional Networks
<https://arxiv.org/abs/2002.05287>`
<https://arxiv.org/abs/2002.05287>`__

Nodes represent articles from the English Wikipedia, edges reflect mutual
links between them. Node features indicate the presence of particular nouns
@@ -256,3 +253,155 @@ def __init__(
verbose=verbose,
transform=transform,
)


class CornellDataset(GeomGCNDataset):
    r"""Cornell portion of the
    `WebKB <http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/>`__
    collection, in the version modified by
    `Geom-GCN: Geometric Graph Convolutional Networks
    <https://arxiv.org/abs/2002.05287>`__

    Every node is a web page and every edge is a hyperlink between two pages.
    A node's feature vector is the bag-of-words representation of its page.
    Pages were manually assigned to one of five categories: student, project,
    course, staff, and faculty.

    Statistics:

    - Nodes: 183
    - Edges: 298
    - Number of Classes: 5
    - 10 train/val/test splits

        - Train: 87
        - Val: 59
        - Test: 37

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory where the processed data is stored.
        Default: ~/.dgl/
    force_reload : bool, optional
        If True, re-download the data source. Default: False
    verbose : bool, optional
        If True, print progress information. Default: True
    transform : callable, optional
        A callable that receives a :class:`~dgl.DGLGraph` object and returns
        a transformed version of it. It is applied to the
        :class:`~dgl.DGLGraph` object before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Notes
    -----
    The graph does not come with edges for both directions.

    Examples
    --------

    >>> from dgl.data import CornellDataset
    >>> dataset = CornellDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get data split
    >>> train_mask = g.ndata["train_mask"]
    >>> val_mask = g.ndata["val_mask"]
    >>> test_mask = g.ndata["test_mask"]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        # The dataset name is fixed; every other option is forwarded unchanged
        # to the shared Geom-GCN base class.
        base_kwargs = {
            "name": "cornell",
            "raw_dir": raw_dir,
            "force_reload": force_reload,
            "verbose": verbose,
            "transform": transform,
        }
        super().__init__(**base_kwargs)


class TexasDataset(GeomGCNDataset):
    r"""Texas portion of the
    `WebKB <http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/>`__
    collection, in the version modified by
    `Geom-GCN: Geometric Graph Convolutional Networks
    <https://arxiv.org/abs/2002.05287>`__

    Every node is a web page and every edge is a hyperlink between two pages.
    A node's feature vector is the bag-of-words representation of its page.
    Pages were manually assigned to one of five categories: student, project,
    course, staff, and faculty.

    Statistics:

    - Nodes: 183
    - Edges: 325
    - Number of Classes: 5
    - 10 train/val/test splits

        - Train: 87
        - Val: 59
        - Test: 37

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory where the processed data is stored.
        Default: ~/.dgl/
    force_reload : bool, optional
        If True, re-download the data source. Default: False
    verbose : bool, optional
        If True, print progress information. Default: True
    transform : callable, optional
        A callable that receives a :class:`~dgl.DGLGraph` object and returns
        a transformed version of it. It is applied to the
        :class:`~dgl.DGLGraph` object before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Notes
    -----
    The graph does not come with edges for both directions.

    Examples
    --------

    >>> from dgl.data import TexasDataset
    >>> dataset = TexasDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get data split
    >>> train_mask = g.ndata["train_mask"]
    >>> val_mask = g.ndata["val_mask"]
    >>> test_mask = g.ndata["test_mask"]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        # The dataset name is fixed; every other option is forwarded unchanged
        # to the shared Geom-GCN base class.
        base_kwargs = {
            "name": "texas",
            "raw_dir": raw_dir,
            "force_reload": force_reload,
            "verbose": verbose,
            "transform": transform,
        }
        super().__init__(**base_kwargs)
Original file line number Diff line number Diff line change
@@ -37,3 +37,37 @@ def test_squirrel():
assert g.num_edges() == 217073
g2 = dgl.data.SquirrelDataset(force_reload=True, transform=transform)[0]
assert g2.num_edges() - g.num_edges() == g.num_nodes()


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
    dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_cornell():
    """Check CornellDataset's graph size and that ``transform`` is applied."""
    self_loop_transform = dgl.AddSelfLoop(allow_duplicate=True)

    # Untransformed graph: verify the expected node and edge counts.
    plain_graph = dgl.data.CornellDataset(force_reload=True)[0]
    assert plain_graph.num_nodes() == 183
    assert plain_graph.num_edges() == 298

    # With the self-loop transform, the graph must have exactly one extra
    # edge per node compared with the untransformed graph.
    looped_graph = dgl.data.CornellDataset(
        force_reload=True, transform=self_loop_transform
    )[0]
    extra_edges = looped_graph.num_edges() - plain_graph.num_edges()
    assert extra_edges == plain_graph.num_nodes()


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
    dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_texas():
    """Check TexasDataset's graph size and that ``transform`` is applied."""
    self_loop_transform = dgl.AddSelfLoop(allow_duplicate=True)

    # Untransformed graph: verify the expected node and edge counts.
    plain_graph = dgl.data.TexasDataset(force_reload=True)[0]
    assert plain_graph.num_nodes() == 183
    assert plain_graph.num_edges() == 325

    # With the self-loop transform, the graph must have exactly one extra
    # edge per node compared with the untransformed graph.
    looped_graph = dgl.data.TexasDataset(
        force_reload=True, transform=self_loop_transform
    )[0]
    extra_edges = looped_graph.num_edges() - plain_graph.num_edges()
    assert extra_edges == plain_graph.num_nodes()