Module graphlearning.datasets
Datasets
This module allows for loading standard datasets (currently mnist, fashionmnist, cifar), and creating and saving new datasets by name locally.
Expand source code
"""
Datasets
==========
This module allows for loading standard datasets (currently mnist, fashionmnist, cifar), and creating
and saving new datasets by name locally.
"""
import numpy as np
import ssl
import os
from . import utils
#Directory for storing datasets
data_dir = os.path.abspath(os.path.join(os.getcwd(),'data'))
def two_skies(n,sigma=0.15,sep=0.64):
"""Two skies dataset
======
Random sample from the two skies dataset.
Returns
-------
n : int
Number of data points (should be even).
sigma : float (optional)
Standard deviation of the skies.
sep : float (optional)
Separation between the two skies.
Returns
-------
data : numpy array, float
(n,2) numpy array of data.
labels : numpy array, int
Binary labels indicating two skies.
"""
m = int(n/2)
y1 = sigma*np.random.randn(m,1) + sep/2
y2 = sigma*np.random.randn(m,1) - sep/2
y = np.vstack((y1,y2))
x = np.random.rand(2*m,1)
labels = np.vstack((np.zeros(m),np.ones(m)))
data = np.hstack((x,y))
return data,labels
def save(data, labels, dataset, metric='raw', overwrite=False):
"""Save dataset
======
Add a new dataset to graph learning by saving the data and labels.
Parameters
----------
data : (n,m) numpy array, float
n data points in m dimensions.
labels : Length n numpy array, int
Integer values for labels.
dataset : string
Name of dataset.
metric : string (optional), default='raw'
A modifier to add to the dataset name when saving, to distinguish
different types of knn data (not case-sensitive).
overwrite : bool (optional), default=False
Whether to overwrite if dataset already exists.
"""
#Dataset filename
dataFile = dataset.lower()+"_"+metric.lower()+".npz"
labelsFile = dataset.lower()+"_labels.npz"
#Full path to file
dataFile_path = os.path.join(data_dir, dataFile)
labelsFile_path = os.path.join(data_dir, labelsFile)
#Check if Data directory exists
if not os.path.exists(data_dir):
os.makedirs(data_dir)
#Save dataset and labels
if os.path.isfile(dataFile_path) and not overwrite:
print('Data file '+dataFile_path+' already exists. Not saving.')
else:
np.savez_compressed(dataFile_path,data=data)
np.savez_compressed(labelsFile_path,labels=labels)
def load(dataset, metric='raw', labels_only=False):
"""Load dataset
======
Load a dataset. Currently implemented datasets include
1. [mnist](http://yann.lecun.com/exdb/mnist/): metrics are 'raw' and 'vae' (variational autoencoder)
2. [fashionmnist](https://github.com/zalandoresearch/fashion-mnist): metrics are 'raw' and 'vae'
3. [cifar](https://www.cs.toronto.edu/~kriz/cifar.html): metrics are 'raw' and 'aet' (autoencoding transformations). Loads CIFAR-10.
Parameters
----------
dataset : string, {'mnist', 'fashionmnist', 'cifar'}
Name of dataset.
metric : string (optional), default='raw'
Indicates the embedding method used in the graph construction. For example, dataset='mnist' with
metric='vae' loads the latent features from a variational autoencoder trained on MNIST.
labels_only : bool (optional), default=False
Whether to return only the labels. Useful if the dataset is very large and knndata is already
precomputed, so the raw features are not needed.
Returns
-------
data : numpy array, float
(n,d) numpy array of n datapoints in dimension d. Not returned if `labels_only=True`.
labels : numpy array, int
Integer-valued labels in range 0 through k-1, where k is the number of classes.
"""
#Dataset filename
dataFile = dataset.lower()+"_"+metric.lower()+".npz"
labelsFile = dataset.lower()+"_labels.npz"
#Full path to file
dataFile_path = os.path.join(data_dir, dataFile)
labelsFile_path = os.path.join(data_dir, labelsFile)
#Check if Data directory exists
if not os.path.exists(data_dir):
os.makedirs(data_dir)
#Download labels file if needed
if not os.path.exists(labelsFile_path):
urlpath = 'https://github.com/jwcalder/GraphLearning/raw/master/Data/'+labelsFile
utils.download_file(urlpath, labelsFile_path)
#Load labels from npz file
labels = utils.numpy_load(labelsFile_path, 'labels')
if labels_only:
return labels
else:
#Download dataset file if needed
if not os.path.exists(dataFile_path):
urlpath = 'http://www-users.math.umn.edu/~jwcalder/Data/'+dataFile
utils.download_file(urlpath, dataFile_path)
data = utils.numpy_load(dataFile_path, 'data')
return data, labels
Functions
def load(dataset, metric='raw', labels_only=False)
-
Load dataset
Load a dataset. Currently implemented datasets include
- mnist: metrics are 'raw' and 'vae' (variational autoencoder)
- fashionmnist: metrics are 'raw' and 'vae'
- cifar: metrics are 'raw' and 'aet' (autoencoding transformations). Loads CIFAR-10.
Parameters
dataset
:string, {'mnist', 'fashionmnist', 'cifar'}
- Name of dataset.
metric
:string (optional)
, default='raw'
- Indicates the embedding method used in the graph construction. For example, dataset='mnist' with metric='vae' loads the latent features from a variational autoencoder trained on MNIST.
labels_only
:bool (optional)
, default=False
- Whether to return only the labels. Useful if the dataset is very large and knndata is already precomputed, so the raw features are not needed.
Returns
data
:numpy array, float
- (n,d) numpy array of n datapoints in dimension d. Not returned if
labels_only=True
. labels
:numpy array, int
- Integer-valued labels in range 0 through k-1, where k is the number of classes.
Expand source code
def load(dataset, metric='raw', labels_only=False): """Load dataset ====== Load a dataset. Currently implemented datasets include 1. [mnist](http://yann.lecun.com/exdb/mnist/): metrics are 'raw' and 'vae' (variational autoencoder) 2. [fashionmnist](https://github.com/zalandoresearch/fashion-mnist): metrics are 'raw' and 'vae' 3. [cifar](https://www.cs.toronto.edu/~kriz/cifar.html): metrics are 'raw' and 'aet' (autoencoding transformations). Loads CIFAR-10. Parameters ---------- dataset : string, {'mnist', 'fashionmnist', 'cifar'} Name of dataset. metric : string (optional), default='raw' Indicates the embedding method used in the graph construction. For example, dataset='mnist' with metric='vae' loads the latent features from a variational autoencoder trained on MNIST. labels_only : bool (optional), default=False Whether to return only the labels. Useful if the dataset is very large and knndata is already precomputed, so the raw features are not needed. Returns ------- data : numpy array, float (n,d) numpy array of n datapoints in dimension d. Not returned if `labels_only=True`. labels : numpy array, int Integer-valued labels in range 0 through k-1, where k is the number of classes. """ #Dataset filename dataFile = dataset.lower()+"_"+metric.lower()+".npz" labelsFile = dataset.lower()+"_labels.npz" #Full path to file dataFile_path = os.path.join(data_dir, dataFile) labelsFile_path = os.path.join(data_dir, labelsFile) #Check if Data directory exists if not os.path.exists(data_dir): os.makedirs(data_dir) #Download labels file if needed if not os.path.exists(labelsFile_path): urlpath = 'https://github.com/jwcalder/GraphLearning/raw/master/Data/'+labelsFile utils.download_file(urlpath, labelsFile_path) #Load labels from npz file labels = utils.numpy_load(labelsFile_path, 'labels') if labels_only: return labels else: #Download dataset file if needed if not os.path.exists(dataFile_path): urlpath = 'http://www-users.math.umn.edu/~jwcalder/Data/'+dataFile utils.download_file(urlpath, dataFile_path) data = utils.numpy_load(dataFile_path, 'data') return data, labels
def save(data, labels, dataset, metric='raw', overwrite=False)
-
Save dataset
Add a new dataset to graph learning by saving the data and labels.
Parameters
data
:(n,m) numpy array, float
- n data points in m dimensions.
labels
:Length n numpy array, int
- Integer values for labels.
dataset
:string
- Name of dataset.
metric
:string (optional)
, default='raw'
- A modifier to add to the dataset name when saving, to distinguish different types of knn data (not case-sensitive).
overwrite
:bool (optional)
, default=False
- Whether to overwrite if dataset already exists.
Expand source code
def save(data, labels, dataset, metric='raw', overwrite=False): """Save dataset ====== Add a new dataset to graph learning by saving the data and labels. Parameters ---------- data : (n,m) numpy array, float n data points in m dimensions. labels : Length n numpy array, int Integer values for labels. dataset : string Name of dataset. metric : string (optional), default='raw' A modifier to add to the dataset name when saving, to distinguish different types of knn data (not case-sensitive). overwrite : bool (optional), default=False Whether to overwrite if dataset already exists. """ #Dataset filename dataFile = dataset.lower()+"_"+metric.lower()+".npz" labelsFile = dataset.lower()+"_labels.npz" #Full path to file dataFile_path = os.path.join(data_dir, dataFile) labelsFile_path = os.path.join(data_dir, labelsFile) #Check if Data directory exists if not os.path.exists(data_dir): os.makedirs(data_dir) #Save dataset and labels if os.path.isfile(dataFile_path) and not overwrite: print('Data file '+dataFile_path+' already exists. Not saving.') else: np.savez_compressed(dataFile_path,data=data) np.savez_compressed(labelsFile_path,labels=labels)
def two_skies(n, sigma=0.15, sep=0.64)
-
Two skies dataset
Random sample from the two skies dataset.
Returns
n
:int
- Number of data points (should be even).
sigma
:float (optional)
- Standard deviation of the skies.
sep
:float (optional)
- Separation between the two skies.
Returns
data
:numpy array, float
- (n,2) numpy array of data.
labels
:numpy array, int
- Binary labels indicating two skies.
Expand source code
def two_skies(n,sigma=0.15,sep=0.64): """Two skies dataset ====== Random sample from the two skies dataset. Returns ------- n : int Number of data points (should be even). sigma : float (optional) Standard deviation of the skies. sep : float (optional) Separation between the two skies. Returns ------- data : numpy array, float (n,2) numpy array of data. labels : numpy array, int Binary labels indicating two skies. """ m = int(n/2) y1 = sigma*np.random.randn(m,1) + sep/2 y2 = sigma*np.random.randn(m,1) - sep/2 y = np.vstack((y1,y2)) x = np.random.rand(2*m,1) labels = np.vstack((np.zeros(m),np.ones(m))) data = np.hstack((x,y)) return data,labels