# Weight Matrices

This module implements functions that are useful for constructing sparse weight matrices, including efficient high dimensional nearest neighbor searches.

Expand source code
"""
Weight Matrices
==========

This module implements functions that are useful for constructing sparse weight matrices, including
efficient high dimensional nearest neighbor searches.
"""

import numpy as np
from scipy import spatial
from scipy import sparse
import os
from . import utils

#Directory to store knn data
knn_dir = os.path.abspath(os.path.join(os.getcwd(),'knn_data'))

def knn(data, k, kernel='gaussian', eta=None, symmetrize=True, metric='raw', similarity='euclidean', knn_data=None):
    """knn weight matrix
    ======

    General function for constructing knn weight matrices.

    Parameters
    ----------
    data : (n,m) numpy array, or string
        If numpy array, n data points, each of dimension m; if string, the name of a
        stored dataset (e.g., 'mnist', 'fashionmnist', or 'cifar') whose precomputed
        knn data is loaded with load_knn_data.
    k : int
        Number of nearest neighbors to use.
    kernel : string (optional), {'uniform','gaussian','symgaussian','singular','distance'}, default='gaussian'
        The choice of kernel in computing the weights between x_i and each of its k
        nearest neighbors. Let d_k(x_i) denote the distance from x_i to its kth
        nearest neighbor. Then 'uniform' corresponds to w_{ij}=1 (an unweighted knn
        graph), 'gaussian' to w_{ij} = exp(-4||x_i-x_j||^2/d_k(x_i)^2),
        'symgaussian' to w_{ij} = exp(-4||x_i-x_j||^2/(d_k(x_i)d_k(x_j))),
        'distance' to w_{ij} = ||x_i-x_j||, and 'singular' to
        w_{ij} = 1/||x_i-x_j|| when i != j and w_{ii}=1.
    eta : python function handle (optional)
        If provided, this overrides the kernel option and instead uses the weights
        w_{ij} = eta(||x_i-x_j||^2 / d_k(x_i)^2).
    symmetrize : bool (optional), default=True, except when kernel='singular'
        Whether or not to symmetrize the weight matrix before returning.
        Symmetrization is performed by returning (W + W^T)/2, except for
        kernel='distance', in which case the symmetrized edge weights are the true
        distances, kernel='uniform', where the weights are all 0/1, or
        kernel='symgaussian', where the same formula is used for symmetry. Default
        is True, unless the kernel is 'singular', in which case it is False.
    metric : string (optional), default='raw'
        Metric identifier if data is a string (i.e., a dataset).
    similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
        Similarity for nearest neighbor search.
    knn_data : tuple (optional), default=None
        If desired, the user can provide knn_data = (knn_ind, knn_dist), the output
        of a knnsearch, in order to bypass the knnsearch step, which can be slow for
        large datasets.

    Returns
    -------
    W : (n,n) scipy sparse matrix, float
        Sparse weight matrix.
    """

    #If knn_data provided, use it directly
    if knn_data is not None:
        knn_ind, knn_dist = knn_data

    #If data is a string, then load knn data for that stored dataset
    elif type(data) is str:
        knn_ind, knn_dist = load_knn_data(data, metric=metric)

    #Else we have to run a knnsearch
    else:
        knn_ind, knn_dist = knnsearch(data, k, similarity=similarity)

    #Restrict to at most k nearest neighbors
    n = knn_ind.shape[0]
    k = np.minimum(knn_ind.shape[1], k)
    knn_ind = knn_ind[:,:k]
    knn_dist = knn_dist[:,:k]

    #If eta is None, use kernel keyword
    if eta is None:

        if kernel == 'uniform':
            weights = np.ones_like(knn_dist)
        elif kernel == 'gaussian':
            D = knn_dist*knn_dist
            eps = D[:,k-1]  #Squared distance to kth neighbor of each point
            weights = np.exp(-4*D/eps[:,None])
        elif kernel == 'symgaussian':
            eps = knn_dist[:,k-1]  #Distance to kth neighbor of each point
            #Normalize by d_k(x_i)*d_k(x_j) so the formula is symmetric in (i,j)
            weights = np.exp(-4*knn_dist*knn_dist/eps[:,None]/eps[knn_ind])
        elif kernel == 'distance':
            weights = knn_dist
        elif kernel == 'singular':
            weights = knn_dist
            weights[knn_dist==0] = 1  #Guard against division by zero (self/duplicate points)
            weights = 1/weights
            symmetrize = False  #Singular weights are never symmetrized
        else:
            sys.exit('Invalid choice of kernel: ' + kernel)

    #Else use user-defined eta
    else:
        D = knn_dist*knn_dist
        eps = D[:,k-1]
        #eps[:,None] broadcasts the per-point normalizer across the k columns
        weights = eta(D/eps[:,None])

    #Flatten knn data and weights
    knn_ind = knn_ind.flatten()
    weights = weights.flatten()

    #Row (self) index of each flattened entry; integer dtype for sparse indexing
    self_ind = np.repeat(np.arange(n), k)

    #Construct sparse matrix and convert to Compressed Sparse Row (CSR) format
    W = sparse.coo_matrix((weights, (self_ind, knn_ind)), shape=(n,n)).tocsr()

    if symmetrize:
        if kernel in ['distance','uniform']:
            #Entrywise max preserves true distances / 0-1 weights
            W = utils.sparse_max(W, W.transpose())
        elif kernel == 'symgaussian':
            #Merge W and W^T, keeping the larger entry in each position
            W = W + W.T.multiply(W.T > W) - W.multiply(W.T > W)
        else:
            W = (W + W.transpose())/2

    return W

def epsilon_ball(data, epsilon, kernel='gaussian', features=None, epsilon_f=1, eta=None, zero_diagonal=False):
    """Epsilon ball weight matrix
    ======

    General function for constructing a sparse epsilon-ball weight matrix, whose
    weights have the form w_{ij} = eta(||x_i-x_j||^2/epsilon^2) when
    ||x_i-x_j|| <= epsilon, and w_{ij}=0 otherwise. This type of weight matrix is
    only feasible in relatively low dimensions.

    Parameters
    ----------
    data : (n,m) numpy array
        n data points, each of dimension m.
    epsilon : float
        Connectivity radius.
    kernel : string (optional), {'uniform','gaussian','singular','distance'}, default='gaussian'
        The choice of kernel in computing the weights between x_i and x_j when
        ||x_i-x_j|| <= epsilon. 'uniform' corresponds to w_{ij}=1 (an unweighted
        graph), 'gaussian' to w_{ij} = exp(-4||x_i-x_j||^2/epsilon^2), 'distance'
        to w_{ij} = ||x_i-x_j||, and 'singular' to w_{ij} = 1/||x_i-x_j|| when
        i != j and w_{ii}=1.
    features : (n,k) numpy array (optional)
        If provided, the weights are additionally multiplied by the feature
        similarity eta(||y_i-y_j||^2/epsilon_f^2), where y_i is the feature vector
        associated with datapoint i. Useful for building a similarity graph over an
        image for segmentation, with y_i a pixel value or texture indicator.
    epsilon_f : float (optional), default=1
        Connectivity radius for features.
    eta : python function handle (optional)
        If provided, this overrides the kernel option and instead uses the weights
        w_{ij} = eta(||x_i-x_j||^2/epsilon^2).
    zero_diagonal : bool (optional), default=False
        Whether to put zero on the diagonal, instead of eta(0).

    Returns
    -------
    W : (n,n) scipy sparse matrix, float
        Sparse weight matrix.
    """
    n = data.shape[0]  #Number of points

    #Rangesearch to find all pairs within distance epsilon
    Xtree = spatial.cKDTree(data)
    M = Xtree.query_pairs(epsilon)
    #reshape keeps shape (0,2) when there are no pairs, so column indexing is safe
    M = np.array(list(M), dtype=int).reshape(-1, 2)

    #Squared distances between points and neighbors
    V = data[M[:,0],:] - data[M[:,1],:]
    dists = np.sum(V*V, axis=1)
    weights, fzero = __weights__(dists, epsilon, kernel, eta)

    #Optionally modulate the weights by feature similarity
    if features is not None:
        VF = features[M[:,0],:] - features[M[:,1],:]
        Fdists = np.sum(VF*VF, axis=1)
        feature_weights, _ = __weights__(Fdists, epsilon_f, kernel, eta)
        weights = weights*feature_weights
        fzero = fzero**2  #Diagonal entry is the product of both kernels at zero

    #Symmetrize weights and add diagonal entries if they are not zero
    if zero_diagonal:
        weights = np.concatenate((weights, weights))
        M1 = np.concatenate((M[:,0], M[:,1]))
        M2 = np.concatenate((M[:,1], M[:,0]))
    else:
        weights = np.concatenate((weights, weights, fzero*np.ones(n,)))
        M1 = np.concatenate((M[:,0], M[:,1], np.arange(0,n)))
        M2 = np.concatenate((M[:,1], M[:,0], np.arange(0,n)))

    #Construct sparse matrix and convert to Compressed Sparse Row (CSR) format
    W = sparse.coo_matrix((weights, (M1,M2)), shape=(n,n))

    return W.tocsr()

def __weights__(dists, epsilon, kernel, eta):
    """Evaluate kernel weights for an array of squared distances.

    Parameters
    ----------
    dists : numpy array
        Squared distances ||x_i - x_j||^2.
    epsilon : float
        Kernel bandwidth (connectivity radius).
    kernel : string
        One of 'uniform', 'gaussian', 'distance', 'singular'. Ignored when eta is
        provided.
    eta : python function handle or None
        If not None, overrides kernel and the weights are eta(dists/epsilon^2).

    Returns
    -------
    weights : numpy array
        Weight corresponding to each entry of dists.
    fzero : float
        Weight at distance zero, used for diagonal entries.
    """

    #If eta is None, use kernel keyword
    if eta is None:

        if kernel == 'uniform':
            weights = np.ones_like(dists)
            fzero = 1
        elif kernel == 'gaussian':
            weights = np.exp(-4*dists/(epsilon*epsilon))
            fzero = 1
        elif kernel == 'distance':
            weights = np.sqrt(dists)  #dists holds squared distances
            fzero = 0
        elif kernel == 'singular':
            weights = np.sqrt(dists)
            weights[dists==0] = 1  #Guard against division by zero
            weights = 1/weights
            fzero = 1
        else:
            sys.exit('Invalid choice of kernel: ' + kernel)

    #Else use user-defined eta
    else:
        weights = eta(dists/(epsilon*epsilon))
        fzero = eta(0)

    return weights, fzero

def knnsearch(X, k, method=None, similarity='euclidean', dataset=None, metric='raw'):
    """knn search
    ======

    General function for k-nearest neighbor searching, including efficient
    implementations for high dimensional data, and support for saving
    k-nn data to files automatically, for reuse later.

    Parameters
    ----------
    X : (n,m) numpy array
        n data points, each of dimension m.
    k : int
        Number of nearest neighbors to find.
    method : {'kdtree','annoy','brute'} (optional), default: 'kdtree' for m <= 5 and 'annoy' for m > 5
        Algorithm for search. Annoy is an approximate nearest neighbor search and
        requires the Annoy package (https://github.com/spotify/annoy).
    similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
        Similarity for nearest neighbor search. Only 'euclidean' and 'angular' are
        available with 'kdtree' and 'brute'.
    dataset : string (optional), default=None
        If provided, results of the search are saved to a file that can be loaded later.
    metric : string (optional), default='raw'
        A modifier to add to the dataset name when saving, to distinguish different
        types of knn data.

    Returns
    -------
    knn_ind : (n,k) numpy array, int
        Indices of nearest neighbors, including the self point.
    knn_dist : (n,k) numpy array, float
        Distances to all neighbors.
    """

    n = X.shape[0]  #Number of data points
    m = X.shape[1]  #Dimension

    #Default method: exact kdtree in low dimensions, approximate annoy in high
    if method is None:
        if m <= 5:
            method = 'kdtree'
        else:
            method = 'annoy'

    if method in ['kdtree','brute']:

        if not similarity in ['angular','euclidean']:
            sys.exit('Invalid choice of similarity ' + similarity)

        #Angular similarity = Euclidean distance between normalized points.
        #Use a copy so the caller's array is not modified in place.
        if similarity == 'angular':
            X = X/np.linalg.norm(X, axis=1)[:,None]

        if method == 'kdtree':

            Xtree = spatial.cKDTree(X)
            knn_dist, knn_ind = Xtree.query(X, k=k)

        else: #Brute force knn search

            knn_ind = np.zeros((n,k), dtype=int)
            knn_dist = np.zeros((n,k))
            for i in range(n):
                dist = np.linalg.norm(X - X[i,:], axis=1)
                knn_ind[i,:] = np.argsort(dist)[:k]
                knn_dist[i,:] = dist[knn_ind[i,:]]

    elif method == 'annoy':

        if not similarity in ['euclidean','angular','manhattan','hamming','dot']:
            sys.exit('Invalid choice of similarity ' + similarity)

        from annoy import AnnoyIndex

        #Build the annoy index over all points
        u = AnnoyIndex(m, similarity)  # Length of item vector that will be indexed
        for i in range(n):
            u.add_item(i, X[i,:])
        u.build(10)  #10 trees

        #Query the index for each point; get_nns_by_item returns (indices, distances)
        knn_dist = []
        knn_ind = []
        for i in range(n):
            A = u.get_nns_by_item(i, k, include_distances=True, search_k=-1)
            knn_ind.append(A[0])
            knn_dist.append(A[1])

        knn_ind = np.array(knn_ind)
        knn_dist = np.array(knn_dist)

    else:
        sys.exit('Invalid choice of knnsearch method ' + method)

    #If dataset name is provided, save knn data to file for reuse
    if not dataset is None:

        #data file name
        dataFile = dataset.lower() + '_' + metric.lower() + '.npz'

        #Full path to file
        dataFile_path = os.path.join(knn_dir, dataFile)

        #Check if knn_dir exists
        if not os.path.exists(knn_dir):
            os.makedirs(knn_dir)

        np.savez_compressed(dataFile_path, J=knn_ind, D=knn_dist)

    return knn_ind, knn_dist

def load_knn_data(dataset, metric='raw'):
    """Load knn data
    ======

    Loads the results of a saved knn search, downloading precomputed knn data
    from the GraphLearning repository if it is not found locally.

    Parameters
    ----------
    dataset : string
        Name of dataset to load knn data for (not case-sensitive).
    metric : string (optional), default='raw'
        A modifier to add to the dataset name when saving, to distinguish different
        types of knn data (not case-sensitive).

    Returns
    -------
    knn_ind : (n,k) numpy array, int
        Indices of nearest neighbors, including the self point.
    knn_dist : (n,k) numpy array, float
        Distances to all neighbors.
    """

    dataFile = dataset.lower() + "_" + metric.lower() + ".npz"
    dataFile_path = os.path.join(knn_dir, dataFile)

    #Check if knn_dir exists
    if not os.path.exists(knn_dir):
        os.makedirs(knn_dir)

    #Download the knn data if it is not already stored locally
    #NOTE(review): assumes utils provides a download_file(url, path) helper — confirm
    if not os.path.exists(dataFile_path):
        urlpath = 'https://github.com/jwcalder/GraphLearning/raw/master/kNNData/'+dataFile
        utils.download_file(urlpath, dataFile_path)

    #Keys J and D match those written by knnsearch when saving
    data = np.load(dataFile_path, allow_pickle=True)
    knn_ind = data['J']
    knn_dist = data['D']

    return knn_ind, knn_dist

def vae(data, layer_widths=[400,20], no_cuda=False, batch_size=128, epochs=100, learning_rate=1e-3):
    """Variational Autoencoder Embedding
    ======

    Embeds a dataset via a two layer variational autoencoder (VAE) latent
    representation. The VAE is essentially the original one of Kingma and Welling [1].

    Parameters
    ----------
    data : numpy array
        (n,d) array of n datapoints in dimension d.
    layer_widths : list of int, length=2 (optional), default=[400,20]
        First element is the width of the hidden layer, while the second is the
        dimension of the latent space.
    no_cuda : bool (optional), default=False
        Turn off GPU acceleration.
    batch_size : int (optional), default=128
        Size of minibatches used during training.
    epochs : int (optional), default=100
        How many epochs (loops over whole dataset) to train over.
    learning_rate : float (optional), default=1e-3
        Learning rate for optimizer.

    Returns
    -------
    data_vae : numpy array
        Data encoded by the VAE (the latent means).

    References
    ----------
    [1] D.P. Kingma and M. Welling. Auto-encoding variational bayes.
        arXiv:1312.6114, 2014. https://arxiv.org/abs/1312.6114
    """

    import torch
    import torch.utils.data
    from torch import nn, optim
    from torch.nn import functional as F
    from torch.utils.data import Dataset, DataLoader

    class MyDataset(Dataset):
        """Minimal torch Dataset wrapping (data, targets) with an optional transform."""
        def __init__(self, data, targets, transform=None):
            self.data = data
            self.targets = targets
            self.transform = transform

        def __getitem__(self, index):
            x = self.data[index]
            y = self.targets[index]

            if self.transform:
                x = self.transform(x)

            return x, y

        def __len__(self):
            return len(self.data)

    class VAE(nn.Module):
        """Two layer VAE: input -> hidden -> latent -> hidden -> input."""
        def __init__(self, layer_widths):
            super(VAE, self).__init__()

            #layer_widths = [input dim, hidden width, latent dim]
            self.lw = layer_widths
            self.fc1 = nn.Linear(self.lw[0], self.lw[1])
            self.fc21 = nn.Linear(self.lw[1], self.lw[2])  #mean head
            self.fc22 = nn.Linear(self.lw[1], self.lw[2])  #log-variance head
            self.fc3 = nn.Linear(self.lw[2], self.lw[1])
            self.fc4 = nn.Linear(self.lw[1], self.lw[0])

        def encode(self, x):
            h1 = F.relu(self.fc1(x))
            return self.fc21(h1), self.fc22(h1)

        def reparameterize(self, mu, logvar):
            std = torch.exp(0.5*logvar)
            eps = torch.randn_like(std)
            return mu + eps*std

        def decode(self, z):
            h3 = F.relu(self.fc3(z))
            #Sigmoid keeps the reconstruction in [0,1] for the BCE loss
            return torch.sigmoid(self.fc4(h3))

        def forward(self, x):
            mu, logvar = self.encode(x.view(-1, self.lw[0]))
            z = self.reparameterize(mu, logvar)
            return self.decode(z), mu, logvar

    # Reconstruction + KL divergence losses summed over all elements and batch
    def loss_function(recon_x, x, mu, logvar):
        BCE = F.binary_cross_entropy(recon_x, x.view(-1, recon_x.shape[1]), reduction='sum')

        # see Appendix B from VAE paper:
        # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
        # https://arxiv.org/abs/1312.6114
        # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

        return BCE + KLD

    def train(epoch):
        """Run one epoch of training over data_loader."""
        model.train()
        train_loss = 0
        for batch_idx, (batch, _) in enumerate(data_loader):
            batch = batch.to(device)
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(batch)
            loss = loss_function(recon_batch, batch, mu, logvar)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx*len(batch), len(data_loader.dataset),
                    100.*batch_idx/len(data_loader), loss.item()/len(batch)))

        print('====> Epoch: {} Average loss: {:.4f}'.format(
            epoch, train_loss/len(data_loader.dataset)))

    #Prepend input dimension: layer_widths = [d, hidden, latent]
    layer_widths = [data.shape[1]] + list(layer_widths)
    log_interval = 10    #how many batches to wait before logging training status

    #GPU settings
    cuda = not no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}

    #Normalize data to [0,1], as required by the binary cross entropy loss
    data = data - data.min()
    data = data/data.max()
    data = torch.from_numpy(data).float()
    target = np.zeros((data.shape[0],)).astype(int)  #dummy targets; unused by the VAE
    target = torch.from_numpy(target).long()
    dataset = MyDataset(data, target)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, **kwargs)

    #Put model on GPU and set up optimizer
    model = VAE(layer_widths).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    #Training epochs
    for epoch in range(1, epochs + 1):
        train(epoch)

    #Encode the dataset with the trained model
    with torch.no_grad():
        mu, logvar = model.encode(data.to(device).view(-1, layer_widths[0]))
    data_vae = mu.cpu().numpy()

    return data_vae

## Functions

 def epsilon_ball(data, epsilon, kernel='gaussian', features=None, epsilon_f=1, eta=None, zero_diagonal=False) 

# Epsilon ball weight matrix

General function for constructing a sparse epsilon-ball weight matrix, whose weights have the form $w_{i,j} = \eta\left(\frac{\|x_i - x_j\|^2}{\varepsilon^2}\right)$ when $\|x_i - x_j\|\leq \varepsilon$, and $w_{i,j}=0$ otherwise. This type of weight matrix is only feasible in relatively low dimensions.

## Parameters

data : (n,m) numpy array
n data points, each of dimension m
epsilon : float
kernel : string (optional), {'uniform','gaussian','singular','distance'}, default='gaussian'
The choice of kernel in computing the weights between $x_i$ and $x_j$ when $\|x_i-x_j\|\leq \varepsilon$. The choice 'uniform' corresponds to $w_{i,j}=1$ and constitutes an unweighted graph, 'gaussian' corresponds to $w_{i,j} = \exp\left(\frac{-4\|x_i - x_j\|^2}{\varepsilon^2}\right)$, 'distance' corresponds to $w_{i,j} = \|x_i - x_j\|$, and 'singular' corresponds to $w_{i,j} = \frac{1}{\|x_i - x_j\|}$ when $i\neq j$ and $w_{i,i}=1$.
features : (n,k) numpy array (optional)
If provided, then the weights are additionally multiplied by the similarity in features, so that when $\|x_i - x_j\|\leq \varepsilon$, and $w_{i,j}=0$ otherwise. The vector $y_i$ is the feature vector associated with datapoint i. The features are useful for building a similarity graph over an image for image segmentation, and here the $y_i$ are either the pixel values at pixel i, or some other image feature such as a texture indicator.
epsilon_f : float (optional).
Connectivity radius for features $\varepsilon_F$. Default is $\varepsilon_F=1$.
eta : python function handle (optional)
If provided, this overrides the kernel option and instead uses the weights $w_{i,j} = \eta\left(\frac{\|x_i - x_j\|^2}{\varepsilon^2}\right)$.
zero_diagonal : bool (optional)
Whether to put zero on the diagonal, instead of $\eta(0)$. Default is False.

## Returns

W : (n,n) scipy sparse matrix, float
Sparse weight matrix.
Expand source code
def epsilon_ball(data, epsilon, kernel='gaussian', features=None, epsilon_f=1, eta=None, zero_diagonal=False):
    """Epsilon ball weight matrix
    ======

    General function for constructing a sparse epsilon-ball weight matrix, whose
    weights have the form w_{ij} = eta(||x_i-x_j||^2/epsilon^2) when
    ||x_i-x_j|| <= epsilon, and w_{ij}=0 otherwise. This type of weight matrix is
    only feasible in relatively low dimensions.

    Parameters
    ----------
    data : (n,m) numpy array
        n data points, each of dimension m.
    epsilon : float
        Connectivity radius.
    kernel : string (optional), {'uniform','gaussian','singular','distance'}, default='gaussian'
        The choice of kernel in computing the weights between x_i and x_j when
        ||x_i-x_j|| <= epsilon. 'uniform' corresponds to w_{ij}=1 (an unweighted
        graph), 'gaussian' to w_{ij} = exp(-4||x_i-x_j||^2/epsilon^2), 'distance'
        to w_{ij} = ||x_i-x_j||, and 'singular' to w_{ij} = 1/||x_i-x_j|| when
        i != j and w_{ii}=1.
    features : (n,k) numpy array (optional)
        If provided, the weights are additionally multiplied by the feature
        similarity eta(||y_i-y_j||^2/epsilon_f^2), where y_i is the feature vector
        associated with datapoint i. Useful for building a similarity graph over an
        image for segmentation, with y_i a pixel value or texture indicator.
    epsilon_f : float (optional), default=1
        Connectivity radius for features.
    eta : python function handle (optional)
        If provided, this overrides the kernel option and instead uses the weights
        w_{ij} = eta(||x_i-x_j||^2/epsilon^2).
    zero_diagonal : bool (optional), default=False
        Whether to put zero on the diagonal, instead of eta(0).

    Returns
    -------
    W : (n,n) scipy sparse matrix, float
        Sparse weight matrix.
    """
    n = data.shape[0]  #Number of points

    #Rangesearch to find all pairs within distance epsilon
    Xtree = spatial.cKDTree(data)
    M = Xtree.query_pairs(epsilon)
    #reshape keeps shape (0,2) when there are no pairs, so column indexing is safe
    M = np.array(list(M), dtype=int).reshape(-1, 2)

    #Squared distances between points and neighbors
    V = data[M[:,0],:] - data[M[:,1],:]
    dists = np.sum(V*V, axis=1)
    weights, fzero = __weights__(dists, epsilon, kernel, eta)

    #Optionally modulate the weights by feature similarity
    if features is not None:
        VF = features[M[:,0],:] - features[M[:,1],:]
        Fdists = np.sum(VF*VF, axis=1)
        feature_weights, _ = __weights__(Fdists, epsilon_f, kernel, eta)
        weights = weights*feature_weights
        fzero = fzero**2  #Diagonal entry is the product of both kernels at zero

    #Symmetrize weights and add diagonal entries if they are not zero
    if zero_diagonal:
        weights = np.concatenate((weights, weights))
        M1 = np.concatenate((M[:,0], M[:,1]))
        M2 = np.concatenate((M[:,1], M[:,0]))
    else:
        weights = np.concatenate((weights, weights, fzero*np.ones(n,)))
        M1 = np.concatenate((M[:,0], M[:,1], np.arange(0,n)))
        M2 = np.concatenate((M[:,1], M[:,0], np.arange(0,n)))

    #Construct sparse matrix and convert to Compressed Sparse Row (CSR) format
    W = sparse.coo_matrix((weights, (M1,M2)), shape=(n,n))

    return W.tocsr()
 def knn(data, k, kernel='gaussian', eta=None, symmetrize=True, metric='raw', similarity='euclidean', knn_data=None) 

# knn weight matrix

General function for constructing knn weight matrices.

## Parameters

data : (n,m) numpy array, or string
If numpy array, n data points, each of dimension m, if string, then 'mnist', 'fashionmnist', or 'cifar'
k : int
Number of nearest neighbors to use.
kernel : string (optional), {'uniform','gaussian','symgaussian','singular','distance'}, default='gaussian'
The choice of kernel in computing the weights between $x_i$ and each of its k nearest neighbors. We let $d_k(x_i)$ denote the distance from $x_i$ to its kth nearest neighbor. The choice 'uniform' corresponds to $w_{i,j}=1$ and constitutes an unweighted k nearest neighbor graph, 'gaussian' corresponds to $w_{i,j} = \exp\left(\frac{-4\|x_i - x_j\|^2}{d_k(x_i)^2}\right)$, 'symgaussian' corresponds to $w_{i,j} = \exp\left(\frac{-4\|x_i - x_j\|^2}{d_k(x_i)d_k(x_j)}\right)$, 'distance' corresponds to $w_{i,j} = \|x_i - x_j\|$, and 'singular' corresponds to $w_{i,j} = \frac{1}{\|x_i - x_j\|}$ when $i\neq j$ and $w_{i,i}=1$.
eta : python function handle (optional)
If provided, this overrides the kernel option and instead uses the weights $w_{i,j} = \eta\left(\frac{\|x_i - x_j\|^2}{d_k(x_i)^2}\right)$, where $d_k(x_i)$ is the distance from $x_i$ to its kth nearest neighbor.
symmetrize : bool (optional), default=True, except when kernel='singular'
Whether or not to symmetrize the weight matrix before returning. Symmetrization is performed by returning $(W + W^T)/2$, except for when kernel='distance', in which case the symmetrized edge weights are the true distances, kernel='uniform', where the weights are all 0/1, or kernel='symgaussian', where the same formula is used for symmetry. Default for symmetrization is True, unless the kernel is 'singular', in which case it is False.
metric : string (optional), default='raw'
Metric identifier if data is a string (i.e., a dataset).
similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
Similarity for nearest neighbor search.
knn_data : tuple (optional), default=None
If desired, the user can provide knn_data = (knn_ind, knn_dist), the output of a knnsearch, in order to bypass the knnsearch step, which can be slow for large datasets.

## Returns

W : (n,n) scipy sparse matrix, float
Sparse weight matrix.
Expand source code
def knn(data, k, kernel='gaussian', eta=None, symmetrize=True, metric='raw', similarity='euclidean', knn_data=None):
    """knn weight matrix
    ======

    General function for constructing knn weight matrices.

    Parameters
    ----------
    data : (n,m) numpy array, or string
        If numpy array, n data points, each of dimension m; if string, the name of a
        stored dataset (e.g., 'mnist', 'fashionmnist', or 'cifar') whose precomputed
        knn data is loaded with load_knn_data.
    k : int
        Number of nearest neighbors to use.
    kernel : string (optional), {'uniform','gaussian','symgaussian','singular','distance'}, default='gaussian'
        The choice of kernel in computing the weights between x_i and each of its k
        nearest neighbors. Let d_k(x_i) denote the distance from x_i to its kth
        nearest neighbor. Then 'uniform' corresponds to w_{ij}=1 (an unweighted knn
        graph), 'gaussian' to w_{ij} = exp(-4||x_i-x_j||^2/d_k(x_i)^2),
        'symgaussian' to w_{ij} = exp(-4||x_i-x_j||^2/(d_k(x_i)d_k(x_j))),
        'distance' to w_{ij} = ||x_i-x_j||, and 'singular' to
        w_{ij} = 1/||x_i-x_j|| when i != j and w_{ii}=1.
    eta : python function handle (optional)
        If provided, this overrides the kernel option and instead uses the weights
        w_{ij} = eta(||x_i-x_j||^2 / d_k(x_i)^2).
    symmetrize : bool (optional), default=True, except when kernel='singular'
        Whether or not to symmetrize the weight matrix before returning.
        Symmetrization is performed by returning (W + W^T)/2, except for
        kernel='distance', in which case the symmetrized edge weights are the true
        distances, kernel='uniform', where the weights are all 0/1, or
        kernel='symgaussian', where the same formula is used for symmetry. Default
        is True, unless the kernel is 'singular', in which case it is False.
    metric : string (optional), default='raw'
        Metric identifier if data is a string (i.e., a dataset).
    similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
        Similarity for nearest neighbor search.
    knn_data : tuple (optional), default=None
        If desired, the user can provide knn_data = (knn_ind, knn_dist), the output
        of a knnsearch, in order to bypass the knnsearch step, which can be slow for
        large datasets.

    Returns
    -------
    W : (n,n) scipy sparse matrix, float
        Sparse weight matrix.
    """

    #If knn_data provided, use it directly
    if knn_data is not None:
        knn_ind, knn_dist = knn_data

    #If data is a string, then load knn data for that stored dataset
    elif type(data) is str:
        knn_ind, knn_dist = load_knn_data(data, metric=metric)

    #Else we have to run a knnsearch
    else:
        knn_ind, knn_dist = knnsearch(data, k, similarity=similarity)

    #Restrict to at most k nearest neighbors
    n = knn_ind.shape[0]
    k = np.minimum(knn_ind.shape[1], k)
    knn_ind = knn_ind[:,:k]
    knn_dist = knn_dist[:,:k]

    #If eta is None, use kernel keyword
    if eta is None:

        if kernel == 'uniform':
            weights = np.ones_like(knn_dist)
        elif kernel == 'gaussian':
            D = knn_dist*knn_dist
            eps = D[:,k-1]  #Squared distance to kth neighbor of each point
            weights = np.exp(-4*D/eps[:,None])
        elif kernel == 'symgaussian':
            eps = knn_dist[:,k-1]  #Distance to kth neighbor of each point
            #Normalize by d_k(x_i)*d_k(x_j) so the formula is symmetric in (i,j)
            weights = np.exp(-4*knn_dist*knn_dist/eps[:,None]/eps[knn_ind])
        elif kernel == 'distance':
            weights = knn_dist
        elif kernel == 'singular':
            weights = knn_dist
            weights[knn_dist==0] = 1  #Guard against division by zero (self/duplicate points)
            weights = 1/weights
            symmetrize = False  #Singular weights are never symmetrized
        else:
            sys.exit('Invalid choice of kernel: ' + kernel)

    #Else use user-defined eta
    else:
        D = knn_dist*knn_dist
        eps = D[:,k-1]
        #eps[:,None] broadcasts the per-point normalizer across the k columns
        weights = eta(D/eps[:,None])

    #Flatten knn data and weights
    knn_ind = knn_ind.flatten()
    weights = weights.flatten()

    #Row (self) index of each flattened entry; integer dtype for sparse indexing
    self_ind = np.repeat(np.arange(n), k)

    #Construct sparse matrix and convert to Compressed Sparse Row (CSR) format
    W = sparse.coo_matrix((weights, (self_ind, knn_ind)), shape=(n,n)).tocsr()

    if symmetrize:
        if kernel in ['distance','uniform']:
            #Entrywise max preserves true distances / 0-1 weights
            W = utils.sparse_max(W, W.transpose())
        elif kernel == 'symgaussian':
            #Merge W and W^T, keeping the larger entry in each position
            W = W + W.T.multiply(W.T > W) - W.multiply(W.T > W)
        else:
            W = (W + W.transpose())/2

    return W
 def knnsearch(X, k, method=None, similarity='euclidean', dataset=None, metric='raw') 

# knn search

General function for k-nearest neighbor searching, including efficient implementations for high dimensional data, and support for saving k-nn data to files automatically, for reuse later.

## Parameters

X : (n,m) numpy array
n data points, each of dimension m.
k : int
Number of nearest neighbors to find.
method : {'kdtree','annoy','brute'} (optional), default: 'kdtree' for m <=5 and 'annoy' for m>5
Algorithm for search. Annoy is an approximate nearest neighbor search and requires the Annoy package.
similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
Similarity for nearest neighbor search. Only 'euclidean' and 'angular' are available with 'kdtree' and 'brute'.
dataset : string (optional), default=None
If provided, results of the search are saved to a file that can be loaded later.
metric : string (optional), default='raw'
A modifier to add to the dataset name when saving, to distinguish different types of knn data.

## Returns

knn_ind : (n,k) numpy array, int
Indices of nearest neighbors, including the self point.
knn_dist : (n,k) numpy array, float
Distances to all neighbors.
Expand source code
def knnsearch(X, k, method=None, similarity='euclidean', dataset=None, metric='raw'):
    """knn search
    ======

    General function for k-nearest neighbor searching, including efficient
    implementations for high dimensional data, and support for saving
    k-nn data to files automatically, for reuse later.

    Parameters
    ----------
    X : (n,m) numpy array
        n data points, each of dimension m.
    k : int
        Number of nearest neighbors to find.
    method : {'kdtree','annoy','brute'} (optional), default: 'kdtree' for m <= 5 and 'annoy' for m > 5
        Algorithm for search. Annoy is an approximate nearest neighbor search and
        requires the Annoy package (https://github.com/spotify/annoy).
    similarity : {'euclidean','angular','manhattan','hamming','dot'} (optional), default='euclidean'
        Similarity for nearest neighbor search. Only 'euclidean' and 'angular' are
        available with 'kdtree' and 'brute'.
    dataset : string (optional), default=None
        If provided, results of the search are saved to a file that can be loaded later.
    metric : string (optional), default='raw'
        A modifier to add to the dataset name when saving, to distinguish different
        types of knn data.

    Returns
    -------
    knn_ind : (n,k) numpy array, int
        Indices of nearest neighbors, including the self point.
    knn_dist : (n,k) numpy array, float
        Distances to all neighbors.
    """

    n = X.shape[0]  #Number of data points
    m = X.shape[1]  #Dimension

    #Default method: exact kdtree in low dimensions, approximate annoy in high
    if method is None:
        if m <= 5:
            method = 'kdtree'
        else:
            method = 'annoy'

    if method in ['kdtree','brute']:

        if not similarity in ['angular','euclidean']:
            sys.exit('Invalid choice of similarity ' + similarity)

        #Angular similarity = Euclidean distance between normalized points.
        #Use a copy so the caller's array is not modified in place.
        if similarity == 'angular':
            X = X/np.linalg.norm(X, axis=1)[:,None]

        if method == 'kdtree':

            Xtree = spatial.cKDTree(X)
            knn_dist, knn_ind = Xtree.query(X, k=k)

        else: #Brute force knn search

            knn_ind = np.zeros((n,k), dtype=int)
            knn_dist = np.zeros((n,k))
            for i in range(n):
                dist = np.linalg.norm(X - X[i,:], axis=1)
                knn_ind[i,:] = np.argsort(dist)[:k]
                knn_dist[i,:] = dist[knn_ind[i,:]]

    elif method == 'annoy':

        if not similarity in ['euclidean','angular','manhattan','hamming','dot']:
            sys.exit('Invalid choice of similarity ' + similarity)

        from annoy import AnnoyIndex

        #Build the annoy index over all points
        u = AnnoyIndex(m, similarity)  # Length of item vector that will be indexed
        for i in range(n):
            u.add_item(i, X[i,:])
        u.build(10)  #10 trees

        #Query the index for each point; get_nns_by_item returns (indices, distances)
        knn_dist = []
        knn_ind = []
        for i in range(n):
            A = u.get_nns_by_item(i, k, include_distances=True, search_k=-1)
            knn_ind.append(A[0])
            knn_dist.append(A[1])

        knn_ind = np.array(knn_ind)
        knn_dist = np.array(knn_dist)

    else:
        sys.exit('Invalid choice of knnsearch method ' + method)

    #If dataset name is provided, save knn data to file for reuse
    if not dataset is None:

        #data file name
        dataFile = dataset.lower() + '_' + metric.lower() + '.npz'

        #Full path to file
        dataFile_path = os.path.join(knn_dir, dataFile)

        #Check if knn_dir exists
        if not os.path.exists(knn_dir):
            os.makedirs(knn_dir)

        np.savez_compressed(dataFile_path, J=knn_ind, D=knn_dist)

    return knn_ind, knn_dist
 def load_knn_data(dataset, metric='raw') 

Loads the results of a saved knn search.

## Parameters

dataset : string
Name of dataset to load knn data for (not case-sensitive).
metric : string (optional), default='raw'
A modifier to add to the dataset name when saving, to distinguish different types of knn data (not case-sensitive).

## Returns

knn_ind : (n,k) numpy array, int
Indices of nearest neighbors, including the self point.
knn_dist : (n,k) numpy array, float
Distances to all neighbors.
Expand source code
def load_knn_data(dataset, metric='raw'):
    """Load knn data
    ======

    Loads the results of a saved knn search, downloading precomputed knn data
    from the GraphLearning repository if it is not found locally.

    Parameters
    ----------
    dataset : string
        Name of dataset to load knn data for (not case-sensitive).
    metric : string (optional), default='raw'
        A modifier to add to the dataset name when saving, to distinguish different
        types of knn data (not case-sensitive).

    Returns
    -------
    knn_ind : (n,k) numpy array, int
        Indices of nearest neighbors, including the self point.
    knn_dist : (n,k) numpy array, float
        Distances to all neighbors.
    """

    dataFile = dataset.lower() + "_" + metric.lower() + ".npz"
    dataFile_path = os.path.join(knn_dir, dataFile)

    #Check if knn_dir exists
    if not os.path.exists(knn_dir):
        os.makedirs(knn_dir)

    #Download the knn data if it is not already stored locally
    #NOTE(review): assumes utils provides a download_file(url, path) helper — confirm
    if not os.path.exists(dataFile_path):
        urlpath = 'https://github.com/jwcalder/GraphLearning/raw/master/kNNData/'+dataFile
        utils.download_file(urlpath, dataFile_path)

    #Keys J and D match those written by knnsearch when saving
    data = np.load(dataFile_path, allow_pickle=True)
    knn_ind = data['J']
    knn_dist = data['D']

    return knn_ind, knn_dist
 def vae(data, layer_widths=[400, 20], no_cuda=False, batch_size=128, epochs=100, learning_rate=0.001) 

# Variational Autoencoder Embedding

Embeds a dataset via a two layer variational autoencoder (VAE) latent representation. The VAE is essentially the original one of Kingma and Welling, *Auto-Encoding Variational Bayes* (arXiv:1312.6114).

## Parameters

data : numpy array
(n,d) array of n datapoints in dimension d.
layer_widths : list of int, length=2 (optional), default=[400,20]
First element is the width of the hidden layer, while the second is the dimension of the latent space.
no_cuda : bool (optional), default=False
Turn off GPU acceleration.
batch_size : int (optional), default=128
epochs : int (optional), default=100
How many epochs (loops over whole dataset) to train over.
learning_rate : float (optional), default=1e-3
Learning rate for optimizer.

## Returns

data_vae : numpy array
Data encoded by the VAE.

## Example

Using a VAE embedding to construct a similarity weight matrix on MNIST and running Poisson learning at 1 label per class: vae_mnist.py.

import graphlearning as gl

data_vae = gl.weightmatrix.vae(data)

W_raw = gl.weightmatrix.knn(data, 10)
W_vae = gl.weightmatrix.knn(data_vae, 10)

num_train_per_class = 1
train_ind = gl.trainsets.generate(labels, rate=num_train_per_class)
train_labels = labels[train_ind]

pred_labels_raw = gl.ssl.poisson(W_raw).fit_predict(train_ind,train_labels)
pred_labels_vae = gl.ssl.poisson(W_vae).fit_predict(train_ind,train_labels)

accuracy_raw = gl.ssl.ssl_accuracy(labels,pred_labels_raw,train_ind)
accuracy_vae = gl.ssl.ssl_accuracy(labels,pred_labels_vae,train_ind)

print('Raw Accuracy: %.2f%%'%accuracy_raw)
print('VAE Accuracy: %.2f%%'%accuracy_vae)


 D.P. Kingma and M. Welling. Auto-encoding variational bayes. arXiv:1312.6114, 2014.