Module graphlearning.datasets

Datasets

This module allows for loading standard datasets (currently mnist, fashionmnist, cifar10, signmnist), and creating and saving new datasets by name locally.

Expand source code
"""
Datasets
==========

This module allows for loading standard datasets (currently mnist, fashionmnist, cifar10, signmnist), and creating
and saving new datasets by name locally.
"""

import numpy as np
import ssl
import os
import matplotlib.pyplot as plt
from . import utils

#Directory for storing datasets
data_dir = os.path.abspath(os.path.join(os.getcwd(),'data'))

def two_skies(n,sigma=0.15,sep=0.64):
    """Two skies dataset
    ======

    Random sample from the two skies dataset: two horizontal bands of points,
    vertically separated by `sep`, with Gaussian vertical noise.

    Parameters
    ----------
    n : int
        Number of data points (should be even; an odd n yields n-1 points).
    sigma : float (optional), default=0.15
        Standard deviation of the skies.
    sep : float (optional), default=0.64
        Separation between the two skies.

    Returns
    -------
    data : numpy array, float
        (n,2) numpy array of data.
    labels : numpy array, int
        Length n array of binary labels indicating the two skies.
    """

    m = n//2

    #Vertical coordinates: Gaussian noise around +sep/2 (sky 0) and -sep/2 (sky 1)
    y1 = sigma*np.random.randn(m,1) + sep/2
    y2 = sigma*np.random.randn(m,1) - sep/2
    y = np.vstack((y1,y2))

    #Horizontal coordinates are uniform on [0,1)
    x = np.random.rand(2*m,1)

    #Fix: np.vstack of two 1-D arrays returned a (2,m) array; labels must be
    #a flat length-n integer vector aligned with the rows of data.
    labels = np.hstack((np.zeros(m,dtype=int),np.ones(m,dtype=int)))

    data = np.hstack((x,y))
    return data,labels


def save(data, labels, dataset, metric='raw', overwrite=False):
    """Save dataset
    ======

    Add a new dataset to graph learning by saving the data and labels.

    Parameters
    ----------
    data : (n,m) numpy array, float
        n data points in m dimensions.
    labels : Length n numpy array, int
        Integer values for labels.
    dataset : string
        Name of dataset.
    metric : string (optional), default='raw'
        A modifier to add to the dataset name when saving, to distinguish
        different types of knn data (not case-sensitive).
    overwrite : bool (optional), default=False
        Whether to overwrite if dataset already exists.
    """

    #On-disk filenames, lower-cased so dataset/metric names are not case-sensitive
    base = dataset.lower()
    dataFile_path = os.path.join(data_dir, base+"_"+metric.lower()+".npz")
    labelsFile_path = os.path.join(data_dir, base+"_labels.npz")

    #Create the data directory on first use
    os.makedirs(data_dir, exist_ok=True)

    #Refuse to clobber an existing dataset unless overwrite was requested
    if not overwrite and os.path.isfile(dataFile_path):
        print('Data file '+dataFile_path+' already exists. Not saving.')
        return

    #Write compressed npz archives for the features and the labels
    np.savez_compressed(dataFile_path,data=data)
    np.savez_compressed(labelsFile_path,labels=labels)

def load(dataset, metric='raw', labels_only=False):
    """Load dataset
    ======

    Load a dataset. Currently implemented datasets include

    1. [mnist](http://yann.lecun.com/exdb/mnist/): metrics are 'raw' and 'vae' (variational autoencoder)
    2. [fashionmnist](https://github.com/zalandoresearch/fashion-mnist): metrics are 'raw' and 'vae' 
    3. [cifar10](https://www.cs.toronto.edu/~kriz/cifar.html): metrics are 'raw' and 'simclr'. Loads CIFAR-10.
    4. [yalefaces](https://paperswithcode.com/dataset/extended-yale-b-1): Only metric is 'raw'.
    5. [signmnist](https://www.kaggle.com/datasets/datamunge/sign-language-mnist): Sign language version of MNIST.
   
    Parameters
    ----------
    dataset : string, {'mnist', 'fashionmnist', 'cifar10', 'yalefaces', 'signmnist'}
        Name of dataset.
    metric : string (optional), default='raw'
        Indicates the embedding method used in the graph construction. For example, dataset='mnist' with
        metric='vae' loads the latent features from a variational autoencoder trained on MNIST.
    labels_only : bool (optional), default=False
        Whether to return only the labels. Useful if the dataset is very large and knndata is already 
        precomputed, so the raw features are not needed.
    
    Returns
    -------
    data : numpy array, float
        (n,d) numpy array of n datapoints in dimension d. Not returned if `labels_only=True`.
    labels : numpy array, int
        Integer-valued labels in range 0 through k-1, where k is the number of classes.
    """

    #Dataset filenames, lower-cased so dataset/metric names are not case-sensitive
    dataFile = dataset.lower()+"_"+metric.lower()+".npz"
    labelsFile = dataset.lower()+"_labels.npz"

    #Full path to file
    dataFile_path = os.path.join(data_dir, dataFile)
    labelsFile_path = os.path.join(data_dir, labelsFile)

    #Create the data directory on first use
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    #Download labels file if not already cached (labels are small; hosted on GitHub)
    if not os.path.exists(labelsFile_path):
        urlpath = 'https://github.com/jwcalder/GraphLearning/raw/master/Data/'+labelsFile
        utils.download_file(urlpath, labelsFile_path)

    #Load labels from npz file
    labels = utils.numpy_load(labelsFile_path, 'labels')

    if labels_only:
        return labels

    #Download dataset file if not already cached (features are larger; hosted separately)
    if not os.path.exists(dataFile_path):
        urlpath = 'http://www-users.math.umn.edu/~jwcalder/Data/'+dataFile
        utils.download_file(urlpath, dataFile_path)

    data = utils.numpy_load(dataFile_path, 'data')
    return data, labels

def load_image(name):
    """Load image 
    ======

    Load an image, downloading and caching it locally on first use.

    Parameters
    ----------
    name : string
        Name of image, choices are {'cameraman', 'cow', 'house', 'jetplane', 'lake', 'mandril_color', 'mandril_gray', 'peppers_color', 'peppers_gray', 'pirate', 'walkbridge', 'chairtoy', 'chairtoy_highres','chairtoy_bw', 'chairtoy_highres_bw'}

    Returns
    -------
    image : numpy array, float
        (m,n) or (m,n,3) numpy array containing image.
    """

    #Local path where the png is cached (names are not case-sensitive)
    imageFile = name.lower()+'.png'
    image_path = os.path.join(data_dir, imageFile)

    #Create the data directory on first use
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    #Fetch the image from the remote server if it is not cached yet
    if not os.path.exists(image_path):
        utils.download_file('http://www-users.math.umn.edu/~jwcalder/TestImages/'+imageFile, image_path)

    #Read the cached png into a numpy array
    return plt.imread(image_path)

Functions

def load(dataset, metric='raw', labels_only=False)

Load dataset

Load a dataset. Currently implemented datasets include

  1. mnist: metrics are 'raw' and 'vae' (variational autoencoder)
  2. fashionmnist: metrics are 'raw' and 'vae'
  3. cifar10: metrics are 'raw' and 'simclr'. Loads CIFAR-10.
  4. yalefaces: Only metric is 'raw'.
  5. signmnist: Sign language version of MNIST.

Parameters

dataset : string, {'mnist', 'fashionmnist', 'cifar10', 'yalefaces', 'signmnist'}
Name of dataset.
metric : string (optional), default='raw'
Indicates the embedding method used in the graph construction. For example, dataset='mnist' with metric='vae' loads the latent features from a variational autoencoder trained on MNIST.
labels_only : bool (optional), default=False
Whether to return only the labels. Useful if the dataset is very large and knndata is already precomputed, so the raw features are not needed.

Returns

data : numpy array, float
(n,d) numpy array of n datapoints in dimension d. Not returned if labels_only=True.
labels : numpy array, int
Integer-valued labels in range 0 through k-1, where k is the number of classes.
Expand source code
def load(dataset, metric='raw', labels_only=False):
    """Load dataset
    ======

    Load a dataset. Currently implemented datasets include

    1. [mnist](http://yann.lecun.com/exdb/mnist/): metrics are 'raw' and 'vae' (variational autoencoder)
    2. [fashionmnist](https://github.com/zalandoresearch/fashion-mnist): metrics are 'raw' and 'vae' 
    3. [cifar10](https://www.cs.toronto.edu/~kriz/cifar.html): metrics are 'raw' and 'simclr'. Loads CIFAR-10.
    4. [yalefaces](https://paperswithcode.com/dataset/extended-yale-b-1): Only metric is 'raw'.
    5. [signmnist](https://www.kaggle.com/datasets/datamunge/sign-language-mnist): Sign language version of MNIST.
   
    Parameters
    ----------
    dataset : string, {'mnist', 'fashionmnist', 'cifar10', 'yalefaces', 'signmnist'}
        Name of dataset.
    metric : string (optional), default='raw'
        Indicates the embedding method used in the graph construction. For example, dataset='mnist' with
        metric='vae' loads the latent features from a variational autoencoder trained on MNIST.
    labels_only : bool (optional), default=False
        Whether to return only the labels. Useful if the dataset is very large and knndata is already 
        precomputed, so the raw features are not needed.
    
    Returns
    -------
    data : numpy array, float
        (n,d) numpy array of n datapoints in dimension d. Not returned if `labels_only=True`.
    labels : numpy array, int
        Integer-valued labels in range 0 through k-1, where k is the number of classes.
    """

    #Dataset filenames, lower-cased so dataset/metric names are not case-sensitive
    dataFile = dataset.lower()+"_"+metric.lower()+".npz"
    labelsFile = dataset.lower()+"_labels.npz"

    #Full path to file
    dataFile_path = os.path.join(data_dir, dataFile)
    labelsFile_path = os.path.join(data_dir, labelsFile)

    #Create the data directory on first use
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    #Download labels file if not already cached (labels are small; hosted on GitHub)
    if not os.path.exists(labelsFile_path):
        urlpath = 'https://github.com/jwcalder/GraphLearning/raw/master/Data/'+labelsFile
        utils.download_file(urlpath, labelsFile_path)

    #Load labels from npz file
    labels = utils.numpy_load(labelsFile_path, 'labels')

    if labels_only:
        return labels

    #Download dataset file if not already cached (features are larger; hosted separately)
    if not os.path.exists(dataFile_path):
        urlpath = 'http://www-users.math.umn.edu/~jwcalder/Data/'+dataFile
        utils.download_file(urlpath, dataFile_path)

    data = utils.numpy_load(dataFile_path, 'data')
    return data, labels
def load_image(name)

Load image

Load an image.

Parameters

name : string
Name of image, choices are {'cameraman', 'cow', 'house', 'jetplane', 'lake', 'mandril_color', 'mandril_gray', 'peppers_color', 'peppers_gray', 'pirate', 'walkbridge', 'chairtoy', 'chairtoy_highres','chairtoy_bw', 'chairtoy_highres_bw'}

Returns

image : numpy array, float
(m,n) or (m,n,3) numpy array containing image.
Expand source code
def load_image(name):
    """Load image 
    ======

    Load an image, downloading and caching it locally on first use.

    Parameters
    ----------
    name : string
        Name of image, choices are {'cameraman', 'cow', 'house', 'jetplane', 'lake', 'mandril_color', 'mandril_gray', 'peppers_color', 'peppers_gray', 'pirate', 'walkbridge', 'chairtoy', 'chairtoy_highres','chairtoy_bw', 'chairtoy_highres_bw'}

    Returns
    -------
    image : numpy array, float
        (m,n) or (m,n,3) numpy array containing image.
    """

    #Local path where the png is cached (names are not case-sensitive)
    imageFile = name.lower()+'.png'
    image_path = os.path.join(data_dir, imageFile)

    #Create the data directory on first use
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    #Fetch the image from the remote server if it is not cached yet
    if not os.path.exists(image_path):
        utils.download_file('http://www-users.math.umn.edu/~jwcalder/TestImages/'+imageFile, image_path)

    #Read the cached png into a numpy array
    return plt.imread(image_path)
def save(data, labels, dataset, metric='raw', overwrite=False)

Save dataset

Add a new dataset to graph learning by saving the data and labels.

Parameters

data : (n,m) numpy array, float
n data points in m dimensions.
labels : Length n numpy array, int
Integer values for labels.
dataset : string
Name of dataset.
metric : string (optional), default='raw'
A modifier to add to the dataset name when saving, to distinguish different types of knn data (not case-sensitive).
overwrite : bool (optional), default=False
Whether to overwrite if dataset already exists.
Expand source code
def save(data, labels, dataset, metric='raw', overwrite=False):
    """Save dataset
    ======

    Add a new dataset to graph learning by saving the data and labels.

    Parameters
    ----------
    data : (n,m) numpy array, float
        n data points in m dimensions.
    labels : Length n numpy array, int
        Integer values for labels.
    dataset : string
        Name of dataset.
    metric : string (optional), default='raw'
        A modifier to add to the dataset name when saving, to distinguish
        different types of knn data (not case-sensitive).
    overwrite : bool (optional), default=False
        Whether to overwrite if dataset already exists.
    """

    #On-disk filenames, lower-cased so dataset/metric names are not case-sensitive
    base = dataset.lower()
    dataFile_path = os.path.join(data_dir, base+"_"+metric.lower()+".npz")
    labelsFile_path = os.path.join(data_dir, base+"_labels.npz")

    #Create the data directory on first use
    os.makedirs(data_dir, exist_ok=True)

    #Refuse to clobber an existing dataset unless overwrite was requested
    if not overwrite and os.path.isfile(dataFile_path):
        print('Data file '+dataFile_path+' already exists. Not saving.')
        return

    #Write compressed npz archives for the features and the labels
    np.savez_compressed(dataFile_path,data=data)
    np.savez_compressed(labelsFile_path,labels=labels)
def two_skies(n, sigma=0.15, sep=0.64)

Two skies dataset

Random sample from the two skies dataset.

Parameters

n : int
Number of data points (should be even).
sigma : float (optional)
Standard deviation of the skies.
sep : float (optional)
Separation between the two skies.

Returns

data : numpy array, float
(n,2) numpy array of data.
labels : numpy array, int
Binary labels indicating two skies.
Expand source code
def two_skies(n,sigma=0.15,sep=0.64):
    """Two skies dataset
    ======

    Random sample from the two skies dataset: two horizontal bands of points,
    vertically separated by `sep`, with Gaussian vertical noise.

    Parameters
    ----------
    n : int
        Number of data points (should be even; an odd n yields n-1 points).
    sigma : float (optional), default=0.15
        Standard deviation of the skies.
    sep : float (optional), default=0.64
        Separation between the two skies.

    Returns
    -------
    data : numpy array, float
        (n,2) numpy array of data.
    labels : numpy array, int
        Length n array of binary labels indicating the two skies.
    """

    m = n//2

    #Vertical coordinates: Gaussian noise around +sep/2 (sky 0) and -sep/2 (sky 1)
    y1 = sigma*np.random.randn(m,1) + sep/2
    y2 = sigma*np.random.randn(m,1) - sep/2
    y = np.vstack((y1,y2))

    #Horizontal coordinates are uniform on [0,1)
    x = np.random.rand(2*m,1)

    #Fix: np.vstack of two 1-D arrays returned a (2,m) array; labels must be
    #a flat length-n integer vector aligned with the rows of data.
    labels = np.hstack((np.zeros(m,dtype=int),np.ones(m,dtype=int)))

    data = np.hstack((x,y))
    return data,labels