Module graphlearning.datasets

Datasets

This module allows for loading standard datasets (currently mnist, fashionmnist, cifar10, signmnist), and creating and saving new datasets by name locally.

Expand source code
"""
Datasets
==========

This module allows for loading standard datasets (currently mnist, fashionmnist, cifar10, signmnist), and creating
and saving new datasets by name locally.
"""

import numpy as np
import ssl
import os
import matplotlib.pyplot as plt
from . import utils

#Directory for storing datasets
data_dir = os.path.abspath(os.path.join(os.getcwd(),'data'))

def two_skies(n,sigma=0.15,sep=0.64):
    """Two skies dataset
    ======

    Random sample from the two skies dataset: two horizontal bands of points,
    vertically separated by `sep`, with Gaussian vertical noise.

    Parameters
    ----------
    n : int
        Number of data points (should be even; an odd n yields n-1 points).
    sigma : float (optional), default=0.15
        Standard deviation of the skies.
    sep : float (optional), default=0.64
        Separation between the two skies.

    Returns
    -------
    data : numpy array, float
        (n,2) numpy array of data.
    labels : numpy array, int
        Length n array of binary labels indicating the two skies.
    """

    m = n//2

    #Vertical coordinates: Gaussian noise around +sep/2 (sky 0) and -sep/2 (sky 1)
    y1 = sigma*np.random.randn(m,1) + sep/2
    y2 = sigma*np.random.randn(m,1) - sep/2
    y = np.vstack((y1,y2))

    #Horizontal coordinates are uniform on [0,1)
    x = np.random.rand(2*m,1)

    #Fix: np.vstack of two 1-D arrays returned a (2,m) array; labels must be
    #a flat length-n integer vector aligned with the rows of data.
    labels = np.hstack((np.zeros(m,dtype=int),np.ones(m,dtype=int)))

    data = np.hstack((x,y))
    return data,labels


def save(data, labels, dataset, metric='raw', overwrite=False):
    """Save dataset
    ======

    Add a new dataset to graph learning by saving the data and labels.

    Parameters
    ----------
    data : (n,m) numpy array, float
        n data points in m dimensions.
    labels : Length n numpy array, int
        Integer values for labels.
    dataset : string
        Name of dataset.
    metric : string (optional), default='raw'
        A modifier to add to the dataset name when saving, to distinguish
        different types of knn data (not case-sensitive).
    overwrite : bool (optional), default=False
        Whether to overwrite if dataset already exists.
    """

    #On-disk filenames, lower-cased so dataset/metric names are not case-sensitive
    base = dataset.lower()
    dataFile_path = os.path.join(data_dir, base+"_"+metric.lower()+".npz")
    labelsFile_path = os.path.join(data_dir, base+"_labels.npz")

    #Create the data directory on first use
    os.makedirs(data_dir, exist_ok=True)

    #Refuse to clobber an existing dataset unless overwrite was requested
    if not overwrite and os.path.isfile(dataFile_path):
        print('Data file '+dataFile_path+' already exists. Not saving.')
        return

    #Write compressed npz archives for the features and the labels
    np.savez_compressed(dataFile_path,data=data)
    np.savez_compressed(labelsFile_path,labels=labels)

def load(dataset, metric='raw', labels_only=False):
    """Load dataset
    ======

    Load a dataset. Currently implemented datasets include

    1. [mnist](http://yann.lecun.com/exdb/mnist/): metrics are 'raw' and 'vae' (variational autoencoder)
    2. [fashionmnist](https://github.com/zalandoresearch/fashion-mnist): metrics are 'raw' and 'vae' 
    3. [cifar10](https://www.cs.toronto.edu/~kriz/cifar.html): metrics are 'raw' and 'simclr'. Loads CIFAR-10.
    4. [yalefaces](https://paperswithcode.com/dataset/extended-yale-b-1): Only metric is 'raw'.
    5. [signmnist](https://www.kaggle.com/datasets/datamunge/sign-language-mnist): Sign language version of MNIST.
   
    Parameters
    ----------
    dataset : string, {'mnist', 'fashionmnist', 'cifar10', 'yalefaces', 'signmnist'}
        Name of dataset.
    metric : string (optional), default='raw'
        Indicates the embedding method used in the graph construction. For example, dataset='mnist' with
        metric='vae' loads the latent features from a variational autoencoder trained on MNIST.
    labels_only : bool (optional), default=False
        Whether to return only the labels. Useful if the dataset is very large and knndata is already 
        precomputed, so the raw features are not needed.
    
    Returns
    -------
    data : numpy array, float
        (n,d) numpy array of n datapoints in dimension d. Not returned if `labels_only=True`.
    labels : numpy array, int
        Integer-valued labels in range 0 through k-1, where k is the number of classes.
    """

    #Dataset filenames, lower-cased so dataset/metric names are not case-sensitive
    dataFile = dataset.lower()+"_"+metric.lower()+".npz"
    labelsFile = dataset.lower()+"_labels.npz"

    #Full path to file
    dataFile_path = os.path.join(data_dir, dataFile)
    labelsFile_path = os.path.join(data_dir, labelsFile)

    #Create the data directory on first use
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    #Download labels file if not already cached (labels are small; hosted on GitHub)
    if not os.path.exists(labelsFile_path):
        urlpath = 'https://github.com/jwcalder/GraphLearning/raw/master/Data/'+labelsFile
        utils.download_file(urlpath, labelsFile_path)

    #Load labels from npz file
    labels = utils.numpy_load(labelsFile_path, 'labels')

    if labels_only:
        return labels

    #Download dataset file if not already cached (features are larger; hosted separately)
    if not os.path.exists(dataFile_path):
        urlpath = 'http://www-users.math.umn.edu/~jwcalder/Data/'+dataFile
        utils.download_file(urlpath, dataFile_path)

    data = utils.numpy_load(dataFile_path, 'data')
    return data, labels

def load_image(name):
    """Load image 
    ======

    Load an image, downloading and caching it locally on first use.

    Parameters
    ----------
    name : string
        Name of image, choices are {'cameraman', 'cow', 'house', 'jetplane', 'lake', 'mandril_color', 'mandril_gray', 'peppers_color', 'peppers_gray', 'pirate', 'walkbridge', 'chairtoy', 'chairtoy_highres','chairtoy_bw', 'chairtoy_highres_bw'}

    Returns
    -------
    image : numpy array, float
        (m,n) or (m,n,3) numpy array containing image.
    """

    #Local path where the png is cached (names are not case-sensitive)
    imageFile = name.lower()+'.png'
    image_path = os.path.join(data_dir, imageFile)

    #Create the data directory on first use
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    #Fetch the image from the remote server if it is not cached yet
    if not os.path.exists(image_path):
        utils.download_file('http://www-users.math.umn.edu/~jwcalder/TestImages/'+imageFile, image_path)

    #Read the cached png into a numpy array
    return plt.imread(image_path)

Functions

def load(dataset, metric='raw', labels_only=False)

Load dataset

Load a dataset. Currently implemented datasets include

  1. mnist: metrics are 'raw' and 'vae' (variational autoencoder)
  2. fashionmnist: metrics are 'raw' and 'vae'
  3. cifar10: metrics are 'raw' and 'simclr'. Loads CIFAR-10.
  4. yalefaces: Only metric is 'raw'.
  5. signmnist: Sign language version of MNIST.

Parameters

dataset : string, {'mnist', 'fashionmnist', 'cifar10', 'yalefaces', 'signmnist'}
Name of dataset.
metric : string (optional), default='raw'
Indicates the embedding method used in the graph construction. For example, dataset='mnist' with metric='vae' loads the latent features from a variational autoencoder trained on MNIST.
labels_only : bool (optional), default=False
Whether to return only the labels. Useful if the dataset is very large and knndata is already precomputed, so the raw features are not needed.

Returns

data : numpy array, float
(n,d) numpy array of n datapoints in dimension d. Not returned if labels_only=True.
labels : numpy array, int
Integer-valued labels in range 0 through k-1, where k is the number of classes.
Expand source code
def load(dataset, metric='raw', labels_only=False):
    """Load dataset
    ======

    Load a dataset. Currently implemented datasets include

    1. [mnist](http://yann.lecun.com/exdb/mnist/): metrics are 'raw' and 'vae' (variational autoencoder)
    2. [fashionmnist](https://github.com/zalandoresearch/fashion-mnist): metrics are 'raw' and 'vae' 
    3. [cifar10](https://www.cs.toronto.edu/~kriz/cifar.html): metrics are 'raw' and 'simclr'. Loads CIFAR-10.
    4. [yalefaces](https://paperswithcode.com/dataset/extended-yale-b-1): Only metric is 'raw'.
    5. [signmnist](https://www.kaggle.com/datasets/datamunge/sign-language-mnist): Sign language version of MNIST.
   
    Parameters
    ----------
    dataset : string, {'mnist', 'fashionmnist', 'cifar10', 'yalefaces', 'signmnist'}
        Name of dataset.
    metric : string (optional), default='raw'
        Indicates the embedding method used in the graph construction. For example, dataset='mnist' with
        metric='vae' loads the latent features from a variational autoencoder trained on MNIST.
    labels_only : bool (optional), default=False
        Whether to return only the labels. Useful if the dataset is very large and knndata is already 
        precomputed, so the raw features are not needed.
    
    Returns
    -------
    data : numpy array, float
        (n,d) numpy array of n datapoints in dimension d. Not returned if `labels_only=True`.
    labels : numpy array, int
        Integer-valued labels in range 0 through k-1, where k is the number of classes.
    """

    #Dataset filenames, lower-cased so dataset/metric names are not case-sensitive
    dataFile = dataset.lower()+"_"+metric.lower()+".npz"
    labelsFile = dataset.lower()+"_labels.npz"

    #Full path to file
    dataFile_path = os.path.join(data_dir, dataFile)
    labelsFile_path = os.path.join(data_dir, labelsFile)

    #Create the data directory on first use
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    #Download labels file if not already cached (labels are small; hosted on GitHub)
    if not os.path.exists(labelsFile_path):
        urlpath = 'https://github.com/jwcalder/GraphLearning/raw/master/Data/'+labelsFile
        utils.download_file(urlpath, labelsFile_path)

    #Load labels from npz file
    labels = utils.numpy_load(labelsFile_path, 'labels')

    if labels_only:
        return labels

    #Download dataset file if not already cached (features are larger; hosted separately)
    if not os.path.exists(dataFile_path):
        urlpath = 'http://www-users.math.umn.edu/~jwcalder/Data/'+dataFile
        utils.download_file(urlpath, dataFile_path)

    data = utils.numpy_load(dataFile_path, 'data')
    return data, labels
def load_image(name)

Load image

Load an image.

Parameters

name : string
Name of image, choices are {'cameraman', 'cow', 'house', 'jetplane', 'lake', 'mandril_color', 'mandril_gray', 'peppers_color', 'peppers_gray', 'pirate', 'walkbridge', 'chairtoy', 'chairtoy_highres','chairtoy_bw', 'chairtoy_highres_bw'}

Returns

image : numpy array, float
(m,n) or (m,n,3) numpy array containing image.
Expand source code
def load_image(name):
    """Load image 
    ======

    Load an image, downloading and caching it locally on first use.

    Parameters
    ----------
    name : string
        Name of image, choices are {'cameraman', 'cow', 'house', 'jetplane', 'lake', 'mandril_color', 'mandril_gray', 'peppers_color', 'peppers_gray', 'pirate', 'walkbridge', 'chairtoy', 'chairtoy_highres','chairtoy_bw', 'chairtoy_highres_bw'}

    Returns
    -------
    image : numpy array, float
        (m,n) or (m,n,3) numpy array containing image.
    """

    #Local path where the png is cached (names are not case-sensitive)
    imageFile = name.lower()+'.png'
    image_path = os.path.join(data_dir, imageFile)

    #Create the data directory on first use
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    #Fetch the image from the remote server if it is not cached yet
    if not os.path.exists(image_path):
        utils.download_file('http://www-users.math.umn.edu/~jwcalder/TestImages/'+imageFile, image_path)

    #Read the cached png into a numpy array
    return plt.imread(image_path)
def save(data, labels, dataset, metric='raw', overwrite=False)

Save dataset

Add a new dataset to graph learning by saving the data and labels.

Parameters

data : (n,m) numpy array, float
n data points in m dimensions.
labels : Length n numpy array, int
Integer values for labels.
dataset : string
Name of dataset.
metric : string (optional), default='raw'
A modifier to add to the dataset name when saving, to distinguish different types of knn data (not case-sensitive).
overwrite : bool (optional), default=False
Whether to overwrite if dataset already exists.
Expand source code
def save(data, labels, dataset, metric='raw', overwrite=False):
    """Save dataset
    ======

    Add a new dataset to graph learning by saving the data and labels.

    Parameters
    ----------
    data : (n,m) numpy array, float
        n data points in m dimensions.
    labels : Length n numpy array, int
        Integer values for labels.
    dataset : string
        Name of dataset.
    metric : string (optional), default='raw'
        A modifier to add to the dataset name when saving, to distinguish
        different types of knn data (not case-sensitive).
    overwrite : bool (optional), default=False
        Whether to overwrite if dataset already exists.
    """

    #On-disk filenames, lower-cased so dataset/metric names are not case-sensitive
    base = dataset.lower()
    dataFile_path = os.path.join(data_dir, base+"_"+metric.lower()+".npz")
    labelsFile_path = os.path.join(data_dir, base+"_labels.npz")

    #Create the data directory on first use
    os.makedirs(data_dir, exist_ok=True)

    #Refuse to clobber an existing dataset unless overwrite was requested
    if not overwrite and os.path.isfile(dataFile_path):
        print('Data file '+dataFile_path+' already exists. Not saving.')
        return

    #Write compressed npz archives for the features and the labels
    np.savez_compressed(dataFile_path,data=data)
    np.savez_compressed(labelsFile_path,labels=labels)
def two_skies(n, sigma=0.15, sep=0.64)

Two skies dataset

Random sample from the two skies dataset.

Parameters

n : int
Number of data points (should be even).
sigma : float (optional)
Standard deviation of the skies.
sep : float (optional)
Separation between the two skies.

Returns

data : numpy array, float
(n,2) numpy array of data.
labels : numpy array, int
Binary labels indicating two skies.
Expand source code
def two_skies(n,sigma=0.15,sep=0.64):
    """Two skies dataset
    ======

    Random sample from the two skies dataset: two horizontal bands of points,
    vertically separated by `sep`, with Gaussian vertical noise.

    Parameters
    ----------
    n : int
        Number of data points (should be even; an odd n yields n-1 points).
    sigma : float (optional), default=0.15
        Standard deviation of the skies.
    sep : float (optional), default=0.64
        Separation between the two skies.

    Returns
    -------
    data : numpy array, float
        (n,2) numpy array of data.
    labels : numpy array, int
        Length n array of binary labels indicating the two skies.
    """

    m = n//2

    #Vertical coordinates: Gaussian noise around +sep/2 (sky 0) and -sep/2 (sky 1)
    y1 = sigma*np.random.randn(m,1) + sep/2
    y2 = sigma*np.random.randn(m,1) - sep/2
    y = np.vstack((y1,y2))

    #Horizontal coordinates are uniform on [0,1)
    x = np.random.rand(2*m,1)

    #Fix: np.vstack of two 1-D arrays returned a (2,m) array; labels must be
    #a flat length-n integer vector aligned with the rows of data.
    labels = np.hstack((np.zeros(m,dtype=int),np.ones(m,dtype=int)))

    data = np.hstack((x,y))
    return data,labels