Module graphlearning.weightmatrix
Weight Matrices
This module implements functions that are useful for constructing sparse weight matrices, including efficient high dimensional nearest neighbor searches.
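For orientation, a minimal usage sketch (the random point cloud below is purely illustrative; any (n,m) numpy array of data points works the same way):

```py
import numpy as np
import graphlearning as gl

X = np.random.rand(500, 2)  #500 illustrative data points in 2D

#Sparse Gaussian kNN weight matrix (10 nearest neighbors)
W_knn = gl.weightmatrix.knn(X, 10)

#Sparse epsilon-ball weight matrix with connectivity radius 0.1
W_eps = gl.weightmatrix.epsilon_ball(X, 0.1)
```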
Source code
"""
Weight Matrices
==========
This module implements functions that are useful for constructing sparse weight matrices, including
efficient high dimensional nearest neighbor searches.
"""
import numpy as np
from scipy import spatial
from scipy import sparse
import os
import sys
from . import utils
#Directory to store knn data
knn_dir = os.path.abspath(os.path.join(os.getcwd(),'knn_data'))
def grid_graph(n,m=None,return_xy=False):
'''Grid graph
========
Returns the adjacency matrix for a graph on a regular grid.
Parameters
----------
n : int
Number of pixels wide. Or if n is an image, then (n,m) are taken as the shape
of the first and second dimensions of the image.
m : int (Optional)
Number of pixels high. Does not need to be specified if n is an image.
return_xy : bool (optional, default=False)
Whether to return x,y coordinates as well
Returns
-------
W : (m*n,m*n) scipy sparse matrix
Weight matrix of nearest neighbor graph on (m,n) grid
X : (m*n,2) numpy array, float
        Coordinates of vertices of grid
'''
if m is None:
s = n.shape
m = s[1]
n = s[0]
xm, ym = np.meshgrid(np.arange(m),np.arange(n))
c = (xm + m*ym).flatten()
ne = (np.clip(xm + 1,0,m-1) + m*ym).flatten()
nw = (np.clip(xm - 1,0,m-1) + m*ym).flatten()
nn = (xm + m*np.clip(ym + 1,0,n-1)).flatten()
ns = (xm + m*np.clip(ym - 1,0,n-1)).flatten()
edges = np.vstack((c,ne)).T
edges = np.vstack((edges,np.vstack((c,nw)).T))
edges = np.vstack((edges,np.vstack((c,nn)).T))
edges = np.vstack((edges,np.vstack((c,ns)).T))
ind = edges[:,0] != edges[:,1]
edges = edges[ind,:]
W = sparse.coo_matrix((np.ones(len(edges)), (edges[:,0],edges[:,1])),shape=(m*n,m*n))
if return_xy:
X = np.vstack((xm.flatten(),ym.flatten())).T
return W.tocsr(),X.astype(float)
else:
return W.tocsr()
def knn(data, k, kernel='gaussian', eta=None, symmetrize=True, metric='raw', similarity='euclidean', knn_data=None):
"""knn weight matrix
======
General function for constructing knn weight matrices.
Parameters
----------
data : (n,m) numpy array, or string
        If a numpy array, n data points, each of dimension m; if a string, then 'mnist', 'fashionmnist', or 'cifar'
k : int
Number of nearest neighbors to use.
kernel : string (optional), {'uniform','gaussian','symgaussian','singular','distance'}, default='gaussian'
The choice of kernel in computing the weights between \\(x_i\\) and each of its k
nearest neighbors. We let \\(d_k(x_i)\\) denote the distance from \\(x_i\\) to its kth
nearest neighbor. The choice 'uniform' corresponds to \\(w_{i,j}=1\\) and constitutes
an unweighted k nearest neighbor graph, 'gaussian' corresponds to
\\[ w_{i,j} = \\exp\\left(\\frac{-4\\|x_i - x_j\\|^2}{d_k(x_i)^2} \\right), \\]
'symgaussian' corresponds to
\\[ w_{i,j} = \\exp\\left(\\frac{-4\\|x_i - x_j\\|^2}{d_k(x_i)d_k(x_j)} \\right), \\]
'distance' corresponds to
\\[ w_{i,j} = \\|x_i - x_j\\|, \\]
and 'singular' corresponds to
\\[ w_{i,j} = \\frac{1}{\\|x_i - x_j\\|}, \\]
when \\(i\\neq j\\) and \\(w_{i,i}=1\\).
eta : python function handle (optional)
If provided, this overrides the kernel option and instead uses the weights
\\[ w_{i,j} = \\eta\\left(\\frac{\\|x_i - x_j\\|^2}{d_k(x_i)^2} \\right), \\]
where \\(d_k(x_i)\\) is the distance from \\(x_i\\) to its kth nearest neighbor.
symmetrize : bool (optional), default=True, except when kernel='singular'
Whether or not to symmetrize the weight matrix before returning. Symmetrization is
        performed by returning \\( (W + W^T)/2 \\), except when kernel='distance' or 'singular', in
which case the symmetrized edge weights are the true distances (or inverses), kernel='uniform',
where the weights are all 0/1, or kernel='symgaussian', where the same formula
is used for symmetry. Default for symmetrization is True, unless the kernel is
'singular', in which case it is False.
metric : string (optional), default='raw'
Metric identifier if data is a string (i.e., a dataset).
similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
        Similarity for nearest neighbor search.
knn_data : tuple (optional), default=None
If desired, the user can provide knn_data = (knn_ind, knn_dist), the output of a knnsearch,
in order to bypass the knnsearch step, which can be slow for large datasets.
Returns
-------
W : (n,n) scipy sparse matrix, float
Sparse weight matrix.
"""
#Self is counted in knn data, so add one
k += 1
#If knn_data provided
if knn_data is not None:
knn_ind, knn_dist = knn_data
#If data is a string, then load knn from a stored dataset
elif type(data) is str:
knn_ind, knn_dist = load_knn_data(data, metric=metric)
#Else we have to run a knnsearch
else:
knn_ind, knn_dist = knnsearch(data, k, similarity=similarity)
#Restrict to k nearest neighbors
n = knn_ind.shape[0]
k = np.minimum(knn_ind.shape[1],k)
knn_ind = knn_ind[:,:k]
knn_dist = knn_dist[:,:k]
#If eta is None, use kernel keyword
if eta is None:
if kernel == 'uniform':
weights = np.ones_like(knn_dist)
elif kernel == 'gaussian':
D = knn_dist*knn_dist
eps = D[:,k-1]
weights = np.exp(-4*D/eps[:,None])
elif kernel == 'symgaussian':
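            #eps[:,None] holds d_k(x_i) along rows; eps[knn_ind] gathers d_k(x_j) for each neighbor j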
eps = knn_dist[:,k-1]
weights = np.exp(-4 * knn_dist * knn_dist / eps[:,None] / eps[knn_ind])
elif kernel == 'distance':
weights = knn_dist
elif kernel == 'singular':
weights = knn_dist
weights[knn_dist==0] = 1
weights = 1/weights
else:
sys.exit('Invalid choice of kernel: ' + kernel)
#Else use user-defined eta
else:
D = knn_dist*knn_dist
eps = D[:,k-1]
weights = eta(D/eps)
#Flatten knn data and weights
knn_ind = knn_ind.flatten()
weights = weights.flatten()
#Self indices
self_ind = np.ones((n,k))*np.arange(n)[:,None]
self_ind = self_ind.flatten()
#Construct sparse matrix and convert to Compressed Sparse Row (CSR) format
W = sparse.coo_matrix((weights, (self_ind, knn_ind)),shape=(n,n)).tocsr()
if symmetrize:
if kernel in ['distance','uniform','singular']:
W = utils.sparse_max(W, W.transpose())
elif kernel == 'symgaussian':
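            #Entrywise maximum of W and W^T (keeps the larger of the two directed weights)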
W = W + W.T.multiply(W.T > W) - W.multiply(W.T > W)
else:
            W = (W + W.transpose())/2
W.setdiag(0)
return W
def epsilon_ball(data, epsilon, kernel='gaussian', features=None, epsilon_f=1, eta=None):
"""Epsilon ball weight matrix
======
General function for constructing a sparse epsilon-ball weight matrix, whose weights have the form
\\[ w_{i,j} = \\eta\\left(\\frac{\\|x_i - x_j\\|^2}{\\varepsilon^2} \\right), \\]
when \\(\\|x_i - x_j\\|\\leq \\varepsilon\\), and \\(w_{i,j}=0\\) otherwise.
This type of weight matrix is only feasible in relatively low dimensions.
The diagonals are always zero.
Parameters
----------
data : (n,m) numpy array
n data points, each of dimension m
epsilon : float
Connectivity radius
kernel : string (optional), {'uniform','gaussian','singular','distance'}, default='gaussian'
The choice of kernel in computing the weights between \\(x_i\\) and \\(x_j\\) when
\\(\\|x_i-x_j\\|\\leq \\varepsilon\\). The choice 'uniform' corresponds to \\(w_{i,j}=1\\)
and constitutes an unweighted graph, 'gaussian' corresponds to
\\[ w_{i,j} = \\exp\\left(\\frac{-4\\|x_i - x_j\\|^2}{\\varepsilon^2} \\right), \\]
'distance' corresponds to
\\[ w_{i,j} = \\|x_i - x_j\\|, \\]
and 'singular' corresponds to
\\[ w_{i,j} = \\frac{1}{\\|x_i - x_j\\|}, \\]
when \\(i\\neq j\\) and \\(w_{i,i}=1\\).
features : (n,k) numpy array (optional)
If provided, then the weights are additionally multiplied by the similarity in features, so that
\\[ w_{i,j} = \\eta\\left(\\frac{\\|y_i - y_j\\|^2}{\\varepsilon_F^2} \\right)\\eta\\left(\\frac{\\|x_i - x_j\\|^2}{\\varepsilon^2} \\right), \\]
when \\(\\|x_i - x_j\\|\\leq \\varepsilon\\), and \\(w_{i,j}=0\\) otherwise. The
vector \\(y_i\\) is the feature vector associated with datapoint i. The features
are useful for building a similarity graph over an image for image segmentation, and
here the \\(y_i\\) are either the pixel values at pixel i, or some other image feature
such as a texture indicator.
epsilon_f : float (optional).
Connectivity radius for features \\(\\varepsilon_F\\). Default is \\(\\varepsilon_F=1\\).
eta : python function handle (optional)
If provided, this overrides the kernel option and instead uses the weights
\\[ w_{i,j} = \\eta\\left(\\frac{\\|x_i - x_j\\|^2}{\\varepsilon^2} \\right). \\]
Returns
-------
W : (n,n) scipy sparse matrix, float
Sparse weight matrix.
"""
n = data.shape[0] #Number of points
#Rangesearch to find nearest neighbors
Xtree = spatial.cKDTree(data)
M = Xtree.query_pairs(epsilon)
M = np.array(list(M))
if len(M) == 0:
return sparse.csr_matrix((n,n))
else:
#Differences between points and neighbors
V = data[M[:,0],:] - data[M[:,1],:]
dists = np.sum(V*V,axis=1)
weights, fzero = __weights__(dists,epsilon,kernel,eta)
#Add differences in features
if features is not None:
VF = features[M[:,0],:] - features[M[:,1],:]
Fdists = np.sum(VF*VF,axis=1)
feature_weights, _ = __weights__(Fdists,epsilon_f,kernel,eta)
weights = weights*feature_weights
fzero = fzero**2
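        #Diagonal entries get weight fzero here, and are then zeroed below by setdiag(0)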
weights = np.concatenate((weights,weights,fzero*np.ones(n,)))
M1 = np.concatenate((M[:,0],M[:,1],np.arange(0,n)))
M2 = np.concatenate((M[:,1],M[:,0],np.arange(0,n)))
#Construct sparse matrix and convert to Compressed Sparse Row (CSR) format
W = sparse.coo_matrix((weights, (M1,M2)),shape=(n,n))
W.setdiag(0)
return W.tocsr()
def __weights__(dists,epsilon,kernel,eta):
#If eta is None, use kernel keyword
if eta is None:
if kernel == 'uniform':
weights = np.ones_like(dists)
fzero = 1
elif kernel == 'gaussian':
weights = np.exp(-4*dists/(epsilon*epsilon))
fzero = 1
elif kernel == 'distance':
weights = np.sqrt(dists)
fzero = 0
elif kernel == 'singular':
weights = np.sqrt(dists)
weights[dists==0] = 1
weights = 1/weights
fzero = 1
else:
sys.exit('Invalid choice of kernel: ' + kernel)
#Else use user-defined eta
else:
weights = eta(dists/(epsilon*epsilon))
fzero = eta(0)
return weights, fzero
def knnsearch(X, k, method=None, similarity='euclidean', dataset=None, metric='raw'):
"""knn search
======
General function for k-nearest neighbor searching, including efficient
implementations for high dimensional data, and support for saving
k-nn data to files automatically, for reuse later.
Parameters
----------
X : (n,m) numpy array
n data points, each of dimension m.
k : int
Number of nearest neighbors to find.
method : {'kdtree','annoy','brute'} (optional), default: 'kdtree' for m <=5 and 'annoy' for m>5
Algorithm for search. Annoy is an approximate nearest neighbor search and requires
the [Annoy](https://github.com/spotify/annoy) package.
similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
        Similarity for nearest neighbor search. Only 'euclidean' and 'angular' are available with
'kdtree' and 'brute'.
dataset : string (optional), default=None
If provided, results of the search are saved to a file that can be loaded later.
metric : string (optional), default='raw'
A modifier to add to the dataset name when saving, to distinguish different types of knn data.
Returns
-------
knn_ind : (n,k) numpy array, int
Indices of nearest neighbors, including the self point.
knn_dist : (n,k) numpy array, float
Distances to all neighbors.
"""
n = X.shape[0]
m = X.shape[1]
if method is None:
if m <= 5:
method = 'kdtree'
else:
method = 'annoy'
if method in ['kdtree','brute']:
        if similarity not in ['angular','euclidean']:
sys.exit('Invalid choice of similarity ' + similarity)
if similarity == 'angular':
Y = X/np.linalg.norm(X,axis=1)[:,None]
else:
Y = X
if method == 'kdtree':
Xtree = spatial.cKDTree(Y)
knn_dist, knn_ind = Xtree.query(Y,k=k)
else: #Brute force knn search
knn_ind = np.zeros((n,k),dtype=int)
knn_dist = np.zeros((n,k))
for i in range(n):
dist = np.linalg.norm(Y - Y[i,:],axis=1)
knn_ind[i,:] = np.argsort(dist)[:k]
knn_dist[i,:] = dist[knn_ind[i,:]]
elif method == 'annoy':
        if similarity not in ['euclidean','angular','manhattan','hamming','dot']:
sys.exit('Invalid choice of similarity ' + similarity)
from annoy import AnnoyIndex
u = AnnoyIndex(m, similarity) # Length of item vector that will be indexed
for i in range(n):
u.add_item(i, X[i,:])
u.build(10) #10 trees
knn_dist = []
knn_ind = []
eps = 1e-20
for i in range(n):
            #Get extra neighbors, in case the approximate search makes mistakes
A = u.get_nns_by_item(i, min(2*k,n), include_distances=True)
ind = np.array(A[0])
#knn_dist.append(A[1]) #These distances are floating point (32-bit) precision
#The code below computes them more accurately
if similarity == 'euclidean':
dist = np.linalg.norm(X[i,:] - X[ind,:],axis=1)
elif similarity == 'angular':
vi = X[i,:]/np.maximum(np.linalg.norm(X[i,:]),eps)
vj = X[ind,:]/np.maximum(np.linalg.norm(X[ind,:],axis=1)[:,None],eps)
dist = np.linalg.norm(vi-vj,axis=1)
elif similarity == 'manhattan':
dist = np.linalg.norm(X[i,:] - X[ind,:],axis=1,ord=1)
elif similarity == 'hamming':
dist = A[1] #hamming is integer-valued, so no need to compute in double precision
elif similarity == 'dot':
dist = np.sum(X[i,:]*X[ind,:],axis=1)
else:
dist = A[1]
ind_sort = np.argsort(dist)[:k]
ind = ind[ind_sort]
dist = dist[ind_sort]
#print(np.max(np.absolute(dist - np.array(A[1]))))
knn_ind.append(ind)
knn_dist.append(dist)
knn_ind = np.array(knn_ind)
knn_dist = np.array(knn_dist)
else:
sys.exit('Invalid choice of knnsearch method ' + method)
#If dataset name is provided, save permutations to file
    if dataset is not None:
#data file name
dataFile = dataset.lower() + '_' + metric.lower() + '.npz'
#Full path to file
dataFile_path = os.path.join(knn_dir, dataFile)
#Check if knn_dir exists
if not os.path.exists(knn_dir):
os.makedirs(knn_dir)
np.savez_compressed(dataFile_path, J=knn_ind, D=knn_dist)
return knn_ind, knn_dist
def load_knn_data(dataset, metric='raw'):
"""Load saved knn data
======
Loads the results of a saved knn search.
Parameters
----------
dataset : string
Name of dataset to load knn data for (not case-sensitive).
metric : string (optional), default='raw'
A modifier to add to the dataset name when saving, to distinguish different types of knn data (not case-sensitive).
Returns
-------
knn_ind : (n,k) numpy array, int
Indices of nearest neighbors, including the self point.
knn_dist : (n,k) numpy array, float
Distances to all neighbors.
"""
dataFile = dataset.lower() + "_" + metric.lower() + ".npz"
dataFile_path = os.path.join(knn_dir, dataFile)
#Check if knn_dir exists
if not os.path.exists(knn_dir):
os.makedirs(knn_dir)
#Download kNN data if necessary
if not os.path.exists(dataFile_path):
urlpath = 'https://github.com/jwcalder/GraphLearning/raw/master/kNNData/'+dataFile
utils.download_file(urlpath, dataFile_path)
knn_ind = utils.numpy_load(dataFile_path, 'J')
knn_dist = utils.numpy_load(dataFile_path, 'D')
return knn_ind, knn_dist
def vae(data, layer_widths=[400,20], no_cuda=False, batch_size=128, epochs=100, learning_rate=1e-3):
"""Variational Autoencoder Embedding
======
Embeds a dataset via a two layer variational autoencoder (VAE) latent representation. The VAE
is essentially the original one from [1].
Parameters
----------
data : numpy array
(n,d) array of n datapoints in dimension d.
layer_widths : list of int, length=2 (optional), default=[400,20]
First element is the width of the hidden layer, while the second is the dimension
of the latent space.
no_cuda : bool (optional), default=False
Turn off GPU acceleration.
batch_size : int (optional), default=128
Batch size for gradient descent.
epochs : int (optional), default=100
How many epochs (loops over whole dataset) to train over.
learning_rate : float (optional), default=1e-3
Learning rate for optimizer.
Returns
-------
data_vae : numpy array
Data encoded by the VAE.
Example
-------
Using a VAE embedding to construct a similarity weight matrix on MNIST and running Poisson learning
at 1 label per class: [vae_mnist.py](https://github.com/jwcalder/GraphLearning/blob/master/examples/vae_mnist.py).
```py
import graphlearning as gl
data, labels = gl.datasets.load('mnist')
data_vae = gl.weightmatrix.vae(data)
W_raw = gl.weightmatrix.knn(data, 10)
W_vae = gl.weightmatrix.knn(data_vae, 10)
num_train_per_class = 1
train_ind = gl.trainsets.generate(labels, rate=num_train_per_class)
train_labels = labels[train_ind]
pred_labels_raw = gl.ssl.poisson(W_raw).fit_predict(train_ind,train_labels)
pred_labels_vae = gl.ssl.poisson(W_vae).fit_predict(train_ind,train_labels)
accuracy_raw = gl.ssl.ssl_accuracy(labels,pred_labels_raw,train_ind)
accuracy_vae = gl.ssl.ssl_accuracy(labels,pred_labels_vae,train_ind)
print('Raw Accuracy: %.2f%%'%accuracy_raw)
print('VAE Accuracy: %.2f%%'%accuracy_vae)
```
References
----------
[1] D.P. Kingma and M. Welling. [Auto-encoding variational bayes.](https://arxiv.org/abs/1312.6114) arXiv:1312.6114, 2014.
"""
import torch
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from torch.nn import functional as F
class MyDataset(Dataset):
def __init__(self, data, targets, transform=None):
self.data = data
self.targets = targets
self.transform = transform
def __getitem__(self, index):
x = self.data[index]
y = self.targets[index]
if self.transform:
x = self.transform(x)
return x, y
def __len__(self):
return len(self.data)
class VAE(nn.Module):
def __init__(self, layer_widths):
super(VAE, self).__init__()
self.lw = layer_widths
self.fc1 = nn.Linear(self.lw[0], self.lw[1])
self.fc21 = nn.Linear(self.lw[1], self.lw[2])
self.fc22 = nn.Linear(self.lw[1], self.lw[2])
self.fc3 = nn.Linear(self.lw[2], self.lw[1])
self.fc4 = nn.Linear(self.lw[1], self.lw[0])
def encode(self, x):
h1 = F.relu(self.fc1(x))
return self.fc21(h1), self.fc22(h1)
def reparameterize(self, mu, logvar):
std = torch.exp(0.5*logvar)
eps = torch.randn_like(std)
return mu + eps*std
def decode(self, z):
h3 = F.relu(self.fc3(z))
return torch.sigmoid(self.fc4(h3))
def forward(self, x):
mu, logvar = self.encode(x.view(-1, self.lw[0]))
z = self.reparameterize(mu, logvar)
return self.decode(z), mu, logvar
# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(recon_x, x, mu, logvar):
BCE = F.binary_cross_entropy(recon_x, x.view(-1, data.shape[1]), reduction='sum')
# see Appendix B from VAE paper:
# Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
# https://arxiv.org/abs/1312.6114
# 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
return BCE + KLD
def train(epoch):
model.train()
train_loss = 0
for batch_idx, (data, _) in enumerate(data_loader):
data = data.to(device)
optimizer.zero_grad()
recon_batch, mu, logvar = model(data)
loss = loss_function(recon_batch, data, mu, logvar)
loss.backward()
train_loss += loss.item()
optimizer.step()
if batch_idx % log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(data_loader.dataset),
100. * batch_idx / len(data_loader),
loss.item() / len(data)))
print('====> Epoch: {} Average loss: {:.4f}'.format(
epoch, train_loss / len(data_loader.dataset)))
layer_widths = [data.shape[1]] + layer_widths
log_interval = 10 #how many batches to wait before logging training status
#GPU settings
cuda = not no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
#Convert to torch dataloaders
data = data - data.min()
data = data/data.max()
data = torch.from_numpy(data).float()
target = np.zeros((data.shape[0],)).astype(int)
target = torch.from_numpy(target).long()
dataset = MyDataset(data, target)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, **kwargs)
#Put model on GPU and set up optimizer
model = VAE(layer_widths).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#Training epochs
for epoch in range(1, epochs + 1):
train(epoch)
#Encode the dataset and save to npz file
with torch.no_grad():
mu, logvar = model.encode(data.to(device).view(-1, layer_widths[0]))
data_vae = mu.cpu().numpy()
return data_vae
Functions
def epsilon_ball(data, epsilon, kernel='gaussian', features=None, epsilon_f=1, eta=None)
Epsilon ball weight matrix

General function for constructing a sparse epsilon-ball weight matrix, whose weights have the form \[ w_{i,j} = \eta\left(\frac{\|x_i - x_j\|^2}{\varepsilon^2} \right) \] when \(\|x_i - x_j\|\leq \varepsilon\), and \(w_{i,j}=0\) otherwise. This type of weight matrix is only feasible in relatively low dimensions. The diagonals are always zero.

Parameters

data : (n,m) numpy array
    n data points, each of dimension m.
epsilon : float
    Connectivity radius.
kernel : string (optional), {'uniform','gaussian','singular','distance'}, default='gaussian'
    The choice of kernel in computing the weights between \(x_i\) and \(x_j\) when \(\|x_i-x_j\|\leq \varepsilon\). The choice 'uniform' corresponds to \(w_{i,j}=1\) and constitutes an unweighted graph, 'gaussian' corresponds to \[ w_{i,j} = \exp\left(\frac{-4\|x_i - x_j\|^2}{\varepsilon^2} \right), \] 'distance' corresponds to \[ w_{i,j} = \|x_i - x_j\|, \] and 'singular' corresponds to \[ w_{i,j} = \frac{1}{\|x_i - x_j\|} \] when \(i\neq j\), with \(w_{i,i}=1\).
features : (n,k) numpy array (optional)
    If provided, then the weights are additionally multiplied by the similarity in features, so that \[ w_{i,j} = \eta\left(\frac{\|y_i - y_j\|^2}{\varepsilon_F^2} \right)\eta\left(\frac{\|x_i - x_j\|^2}{\varepsilon^2} \right) \] when \(\|x_i - x_j\|\leq \varepsilon\), and \(w_{i,j}=0\) otherwise. The vector \(y_i\) is the feature vector associated with datapoint i. The features are useful for building a similarity graph over an image for image segmentation; here the \(y_i\) are either the pixel values at pixel i or some other image feature such as a texture indicator.
epsilon_f : float (optional), default=1
    Connectivity radius for the features, \(\varepsilon_F\).
eta : python function handle (optional)
    If provided, this overrides the kernel option and instead uses the weights \[ w_{i,j} = \eta\left(\frac{\|x_i - x_j\|^2}{\varepsilon^2} \right). \]

Returns

W : (n,n) scipy sparse matrix, float
    Sparse weight matrix.
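Example

A minimal sketch of typical usage; the uniform random data and the radius 0.1 are illustrative choices:

```py
import numpy as np
import graphlearning as gl

X = np.random.rand(500, 2)  #illustrative planar point cloud

#Gaussian weights between all pairs within distance 0.1
W = gl.weightmatrix.epsilon_ball(X, 0.1)

#Custom kernel: eta receives ||x_i - x_j||^2 / epsilon^2
W_custom = gl.weightmatrix.epsilon_ball(X, 0.1, eta=lambda t: np.exp(-t))
```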
def grid_graph(n, m=None, return_xy=False)
Grid graph

Returns the adjacency matrix for a graph on a regular grid.

Parameters

n : int
    Number of pixels wide. Or, if n is an image, then (n,m) are taken as the shape of the first and second dimensions of the image.
m : int (optional)
    Number of pixels high. Does not need to be specified if n is an image.
return_xy : bool (optional), default=False
    Whether to return x,y coordinates as well.

Returns

W : (m*n,m*n) scipy sparse matrix
    Weight matrix of nearest neighbor graph on the (m,n) grid.
X : (m*n,2) numpy array, float
    Coordinates of vertices of the grid.
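Example

A small sketch; the 32x64 grid size is arbitrary:

```py
import graphlearning as gl

#Adjacency matrix of the 4-neighbor graph on a 32x64 grid,
#along with the (x,y) coordinates of the grid vertices
W, X = gl.weightmatrix.grid_graph(32, 64, return_xy=True)
print(W.shape)  #(2048, 2048)
print(X.shape)  #(2048, 2)
```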
def knn(data, k, kernel='gaussian', eta=None, symmetrize=True, metric='raw', similarity='euclidean', knn_data=None)
knn weight matrix

General function for constructing knn weight matrices.

Parameters

data : (n,m) numpy array, or string
    If a numpy array, n data points, each of dimension m; if a string, then 'mnist', 'fashionmnist', or 'cifar'.
k : int
    Number of nearest neighbors to use.
kernel : string (optional), {'uniform','gaussian','symgaussian','singular','distance'}, default='gaussian'
    The choice of kernel in computing the weights between \(x_i\) and each of its k nearest neighbors. We let \(d_k(x_i)\) denote the distance from \(x_i\) to its kth nearest neighbor. The choice 'uniform' corresponds to \(w_{i,j}=1\) and constitutes an unweighted k nearest neighbor graph, 'gaussian' corresponds to \[ w_{i,j} = \exp\left(\frac{-4\|x_i - x_j\|^2}{d_k(x_i)^2} \right), \] 'symgaussian' corresponds to \[ w_{i,j} = \exp\left(\frac{-4\|x_i - x_j\|^2}{d_k(x_i)d_k(x_j)} \right), \] 'distance' corresponds to \[ w_{i,j} = \|x_i - x_j\|, \] and 'singular' corresponds to \[ w_{i,j} = \frac{1}{\|x_i - x_j\|} \] when \(i\neq j\), with \(w_{i,i}=1\).
eta : python function handle (optional)
    If provided, this overrides the kernel option and instead uses the weights \[ w_{i,j} = \eta\left(\frac{\|x_i - x_j\|^2}{d_k(x_i)^2} \right), \] where \(d_k(x_i)\) is the distance from \(x_i\) to its kth nearest neighbor.
symmetrize : bool (optional), default=True, except when kernel='singular'
    Whether or not to symmetrize the weight matrix before returning. Symmetrization is performed by returning \( (W + W^T)/2 \), except when kernel='distance' or 'singular', in which case the symmetrized edge weights are the true distances (or their inverses); kernel='uniform', where the weights are all 0/1; or kernel='symgaussian', where the same formula is used for symmetry. The default is True, unless the kernel is 'singular', in which case it is False.
metric : string (optional), default='raw'
    Metric identifier if data is a string (i.e., a dataset).
similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
    Similarity for nearest neighbor search.
knn_data : tuple (optional), default=None
    If desired, the user can provide knn_data = (knn_ind, knn_dist), the output of a knnsearch, in order to bypass the knnsearch step, which can be slow for large datasets.

Returns

W : (n,n) scipy sparse matrix, float
    Sparse weight matrix.
def knnsearch(X, k, method=None, similarity='euclidean', dataset=None, metric='raw')
knn search

General function for k-nearest neighbor searching, including efficient implementations for high dimensional data and support for automatically saving knn data to files for later reuse.

Parameters

X : (n,m) numpy array
    n data points, each of dimension m.
k : int
    Number of nearest neighbors to find.
method : {'kdtree','annoy','brute'} (optional), default: 'kdtree' for m<=5 and 'annoy' for m>5
    Algorithm for the search. Annoy is an approximate nearest neighbor search and requires the [Annoy](https://github.com/spotify/annoy) package.
similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
    Similarity for nearest neighbor search. Only 'euclidean' and 'angular' are available with 'kdtree' and 'brute'.
dataset : string (optional), default=None
    If provided, results of the search are saved to a file that can be loaded later.
metric : string (optional), default='raw'
    A modifier to add to the dataset name when saving, to distinguish different types of knn data.

Returns

knn_ind : (n,k) numpy array, int
    Indices of nearest neighbors, including the self point.
knn_dist : (n,k) numpy array, float
    Distances to all neighbors.
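Example

A sketch using low dimensional illustrative data, so the exact k-d tree search is selected by default:

```py
import numpy as np
import graphlearning as gl

X = np.random.rand(1000, 3)
knn_ind, knn_dist = gl.weightmatrix.knnsearch(X, 20)

#Each point's nearest neighbor is itself (distance zero),
#since the self point is included in the search output
print(np.allclose(knn_dist[:, 0], 0))  #True
```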
def load_knn_data(dataset, metric='raw')
Load saved knn data

Loads the results of a saved knn search.

Parameters

dataset : string
    Name of dataset to load knn data for (not case-sensitive).
metric : string (optional), default='raw'
    A modifier to add to the dataset name when saving, to distinguish different types of knn data (not case-sensitive).

Returns

knn_ind : (n,k) numpy array, int
    Indices of nearest neighbors, including the self point.
knn_dist : (n,k) numpy array, float
    Distances to all neighbors.
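Example

A sketch assuming the saved MNIST kNN data is available to download on first use:

```py
import graphlearning as gl

#Load (downloading if necessary) the saved kNN search for MNIST
knn_ind, knn_dist = gl.weightmatrix.load_knn_data('mnist')

#Pass the saved search to knn() to skip the search step
W = gl.weightmatrix.knn('mnist', 10, knn_data=(knn_ind, knn_dist))
```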
def vae(data, layer_widths=[400, 20], no_cuda=False, batch_size=128, epochs=100, learning_rate=0.001)
Variational Autoencoder Embedding

Embeds a dataset via a two layer variational autoencoder (VAE) latent representation. The VAE is essentially the original one from [1].

Parameters

data : numpy array
    (n,d) array of n datapoints in dimension d.
layer_widths : list of int, length=2 (optional), default=[400,20]
    First element is the width of the hidden layer, while the second is the dimension of the latent space.
no_cuda : bool (optional), default=False
    Turn off GPU acceleration.
batch_size : int (optional), default=128
    Batch size for gradient descent.
epochs : int (optional), default=100
    How many epochs (loops over the whole dataset) to train over.
learning_rate : float (optional), default=1e-3
    Learning rate for the optimizer.

Returns

data_vae : numpy array
    Data encoded by the VAE.

Example

Using a VAE embedding to construct a similarity weight matrix on MNIST and running Poisson learning at 1 label per class: [vae_mnist.py](https://github.com/jwcalder/GraphLearning/blob/master/examples/vae_mnist.py).

```py
import graphlearning as gl

data, labels = gl.datasets.load('mnist')
data_vae = gl.weightmatrix.vae(data)

W_raw = gl.weightmatrix.knn(data, 10)
W_vae = gl.weightmatrix.knn(data_vae, 10)

num_train_per_class = 1
train_ind = gl.trainsets.generate(labels, rate=num_train_per_class)
train_labels = labels[train_ind]

pred_labels_raw = gl.ssl.poisson(W_raw).fit_predict(train_ind,train_labels)
pred_labels_vae = gl.ssl.poisson(W_vae).fit_predict(train_ind,train_labels)

accuracy_raw = gl.ssl.ssl_accuracy(labels,pred_labels_raw,train_ind)
accuracy_vae = gl.ssl.ssl_accuracy(labels,pred_labels_vae,train_ind)

print('Raw Accuracy: %.2f%%'%accuracy_raw)
print('VAE Accuracy: %.2f%%'%accuracy_vae)
```

References

[1] D.P. Kingma and M. Welling. [Auto-encoding variational bayes.](https://arxiv.org/abs/1312.6114) arXiv:1312.6114, 2014.