Source code for ot.datasets

"""
Simple example datasets
"""

# Author: Remi Flamary <remi.flamary@unice.fr>
#
# License: MIT License

import numpy as np
import scipy as sp
from scipy.stats import ortho_group, multivariate_normal
from .utils import check_random_state, deprecated


[docs] def make_1D_gauss(n, m, s): """return a 1D histogram for a gaussian distribution (`n` bins, mean `m` and std `s`) Parameters ---------- n : int number of bins in the histogram m : float mean value of the gaussian distribution s : float standard deviation of the gaussian distribution Returns ------- h : ndarray (`n`,) 1D histogram for a gaussian distribution """ x = np.arange(n, dtype=np.float64) h = np.exp(-((x - m) ** 2) / (2 * s**2)) return h / h.sum()
@deprecated() def get_1D_gauss(n, m, sigma): """Deprecated see make_1D_gauss""" return make_1D_gauss(n, m, sigma)
[docs] def make_2D_samples_gauss(n, m, sigma, random_state=None): r"""Return `n` samples drawn from 2D gaussian :math:`\mathcal{N}(m, \sigma)` Parameters ---------- n : int number of samples to make m : ndarray, shape (2,) mean value of the gaussian distribution sigma : ndarray, shape (2, 2) covariance matrix of the gaussian distribution random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Returns ------- X : ndarray, shape (`n`, 2) n samples drawn from :math:`\mathcal{N}(m, \sigma)`. """ generator = check_random_state(random_state) if np.isscalar(sigma): sigma = np.array( [ sigma, ] ) if len(sigma) > 1: P = sp.linalg.sqrtm(sigma) res = generator.randn(n, 2).dot(P) + m else: res = generator.randn(n, 2) * np.sqrt(sigma) + m return res
@deprecated() def get_2D_samples_gauss(n, m, sigma, random_state=None): """Deprecated see make_2D_samples_gauss""" return make_2D_samples_gauss(n, m, sigma, random_state=None)
[docs] def make_data_classif(dataset, n, nz=0.5, theta=0, p=0.5, random_state=None, **kwargs): """Dataset generation for classification problems Parameters ---------- dataset : str type of classification problem (see code) n : int number of training samples nz : float noise level (>0) p : float proportion of one class in the binary setting random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Returns ------- X : ndarray, shape (n, d) `n` observation of size `d` y : ndarray, shape (n,) labels of the samples. """ generator = check_random_state(random_state) if dataset.lower() == "3gauss": y = np.floor((np.arange(n) * 1.0 / n * 3)) + 1 x = np.zeros((n, 2)) # class 1 x[y == 1, 0] = -1.0 x[y == 1, 1] = -1.0 x[y == 2, 0] = -1.0 x[y == 2, 1] = 1.0 x[y == 3, 0] = 1.0 x[y == 3, 1] = 0 x[y != 3, :] += 1.5 * nz * generator.randn(sum(y != 3), 2) x[y == 3, :] += 2 * nz * generator.randn(sum(y == 3), 2) elif dataset.lower() == "3gauss2": y = np.floor((np.arange(n) * 1.0 / n * 3)) + 1 x = np.zeros((n, 2)) y[y == 4] = 3 # class 1 x[y == 1, 0] = -2.0 x[y == 1, 1] = -2.0 x[y == 2, 0] = -2.0 x[y == 2, 1] = 2.0 x[y == 3, 0] = 2.0 x[y == 3, 1] = 0 x[y != 3, :] += nz * generator.randn(sum(y != 3), 2) x[y == 3, :] += 2 * nz * generator.randn(sum(y == 3), 2) elif dataset.lower() == "gaussrot": rot = np.array( [[np.cos(theta), np.sin(theta)], [-np.sin(theta), np.cos(theta)]] ) m1 = np.array([-1, 1]) m2 = np.array([1, -1]) y = np.floor((np.arange(n) * 1.0 / n * 2)) + 1 n1 = np.sum(y == 1) n2 = np.sum(y == 2) x = np.zeros((n, 2)) x[y == 1, :] = make_2D_samples_gauss(n1, m1, nz, random_state=generator) x[y == 2, :] = make_2D_samples_gauss(n2, m2, nz, random_state=generator) x = x.dot(rot) elif dataset.lower() == "2gauss_prop": y = np.concatenate((np.ones(int(p * n)), np.zeros(int((1 - p) * n)))) x = np.hstack((0 * y[:, None] - 0, 1 - 2 * y[:, None])) + nz * generator.randn( len(y), 2 ) if ("bias" not in kwargs) and ("b" not in kwargs): kwargs["bias"] = np.array([0, 2]) x[:, 0] += kwargs["bias"][0] x[:, 1] += kwargs["bias"][1] else: x = np.array(0) y = np.array(0) print("unknown dataset") return x, y.astype(int)
@deprecated() def get_data_classif(dataset, n, nz=0.5, theta=0, random_state=None, **kwargs): """Deprecated see make_data_classif""" return make_data_classif(dataset, n, nz=0.5, theta=0, random_state=None, **kwargs)
[docs] def make_gauss_hd( ns, nt, p=100, dim=5, m_diff=3.0, a=(10.0, 15.0), b=(3.0, 3.0), sub_the_same=False ): """Generation of source and target domains from Gaussian HD distributions Parameters ---------- ns : int number of samples (source) nt : int number of samples (target) p : int dimension of the ambient space the data live in dim : (int,int) or int the intrinsic dimensions of the source and target Gaussian HD distriutions. If a single int the intrinsic dimension is assumed to be the same m_diff : float the shift in the first coordinate of the means of the Gaussian HD distributions, i.e. ms_0 and mt_0, respectively (see code) a : (float, float) positive floating numbers corresponding to the isotropic variances in the principal subspace, for the source and target distributions, respectively. The same as \delta in :ref:`[1] <references-make_gauss-hd>`, Proposition 2.2 b : (float, float) positive floating numbers corresponding to the isotropic variance outside the principal subspace for the source and target distributions, respectively. sub_the_same : bool should the source/target Gaussian HD distributions live in the same principal subspace? Returns ------- Xs : ndarray, shape (ns, p) `ns` observations of size `p` (source) Xt : ndarray, shape (nt, p) `nt` observations of size `p` (destination) pmts : list a list containing the parameters of the Gaussian HD distributions .. _references-make_gauss_hd: References ---------- .. [1] Bouveyron, C. & Corneli, M. ("Scaling Optimal Transport to High-Dimensional Gaussian Distributions") """ d = (dim, dim) if isinstance(dim, int) else dim mu = np.zeros((2, p)) S = [] mu[1, 0] = m_diff Q = [ortho_group.rvs(p) for _ in range(2)] if sub_the_same: Q[1] = Q[0] S.append( Q[0] @ np.diag(np.hstack((np.full(d[0], a[0]), np.full(p - d[0], b[0])))) @ Q[0].T ) S.append( Q[1] @ np.diag(np.hstack((np.full(d[1], a[1]), np.full(p - d[1], b[1])))) @ Q[1].T ) Xs = multivariate_normal.rvs(mean=mu[0], cov=S[0], size=ns) Xt = multivariate_normal.rvs(mean=mu[1], cov=S[1], size=ns) ms = mu[0] mt = mu[1] ds = d[0] dt = d[1] sigma2_s = np.array(b[0]) sigma2_t = np.array(b[1]) ls = np.repeat(a[0], ds) - sigma2_s lt = np.repeat(a[1], dt) - sigma2_t Us = Q[0][:, :ds] Ut = Q[1][:, :dt] ds = np.array([ds]) dt = np.array([dt]) prmts = { "ms": ms, "mt": mt, "sigma2_s": sigma2_s, "sigma2_t": sigma2_t, "ls": ls, "lt": lt, "Us": Us, "Ut": Ut, "ds": ds, "dt": dt, "Cs": S[0], "Ct": S[1], } return Xs, Xt, prmts