# Author: Julien Mairal <julien.mairal@inria.fr>
#
# License: BSD 3 clause
from abc import abstractmethod
import numpy as np
import scipy.sparse
import cyanure_wrap
def preprocess(X, centering=False, normalize=True, columns=False):
"""Perform in-place centering or normalization, either of columns or rows
of the input matrix X
Parameters
----------
X : numpy array, or scipy sparse CSR matrix
input matrix
centering : boolean, default=False
perform a centering operation
normalize : boolean, default=True
l2-normalization
columns : boolean, default=False
operates on rows (False) or columns (True).
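Examples
--------
A minimal sketch on synthetic data (shapes are illustrative only); the
return value of the underlying routine is ignored here:
>>> import numpy as np
>>> X = np.random.randn(10, 5)
>>> _ = preprocess(X, centering=True, normalize=True, columns=False)
>>> # rows of X are now centered and l2-normalized in place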
"""
if scipy.sparse.issparse(X):
Xf = X.T
else:
Xf = np.asfortranarray(X.T)
return cyanure_wrap.preprocess_(Xf, centering, normalize, not columns)
class ERM:
"""The generic class for empirical risk minimization problems.
For univariate problems, minimizes
min_{w,b} (1/n) sum_{i=1}^n L( y_i, <w, x_i> + b) + psi(w)
"""
def __init__(self, loss='square', penalty='l2', fit_intercept=False):
"""Initialization function of the ERM class.
Parameters
----------
loss : string, default='square'
Loss function to be used. Possible choices are
- 'square' => L(y,z) = 0.5 ( y-z)^2
- 'logistic' => L(y,z) = log(1 + e^{-y z} )
- 'sqhinge' or 'squared_hinge' => L(y,z) = 0.5 max( 0, 1- y z)^2
- 'safe-logistic' => L(y,z) = e^{ yz - 1 } - y z if yz <= 1
and 0 otherwise
- 'multiclass-logistic' => multinomial logistic (see Latex
documentation).
Note that for binary classification, we assume the labels to be of
the form {-1,+1}
penalty: string, default='l2'
Regularization function psi. Possible choices are
For univariate problems
- 'none' => psi(w) = 0
- 'l2' => psi(w) = (lambd/2) ||w||_2^2
- 'l1' => psi(w) = lambd ||w||_1
- 'elastic-net' => psi(w) = lambd ||w||_1 + (lambd2/2)||w||_2^2
- 'fused-lasso' => psi(w) = lambd sum_{i=2}^p |w[i]-w[i-1]|
+ lambd2||w||_1 + (lambd3/2)||w||_2^2
- 'l1-ball' => encodes the constraint ||w||_1 <= lambd
- 'l2-ball' => encodes the constraint ||w||_2 <= lambd
For multivariate problems, the previous penalties operate on each
individual (e.g., class) predictor.
In addition, multitask-group Lasso penalties are provided for
multivariate problems (w is then a matrix)
- 'l1l2' or 'l1linf', see Latex documentation
fit_intercept: boolean, default='False'
learns an unregularized intercept b (or several intercepts for
multivariate problems)
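Examples
--------
A minimal construction sketch with the names listed above (in practice
one would rather instantiate a subclass such as BinaryClassifier or
Regression):
>>> problem = ERM(loss='logistic', penalty='l2', fit_intercept=True)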
"""
self.loss = loss
if (loss == 'squared_hinge'):
self.loss = 'sqhinge'
self.penalty = penalty
self.fit_intercept = fit_intercept
self.w = 0
self.b = 0
self.dual_variable = None
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, univariate=True, nthreads=-1, seed=0):
"""
The fitting function (the one that does the job)
Parameters
----------
X : numpy array, or scipy sparse CSR matrix
input n x p numpy matrix; the samples are on the rows
y : labels, numpy array.
- vector of size n with real values for regression
- vector of size n with {-1,+1} labels for binary classification,
which will be automatically converted if labels in {0,1} are
provided
- matrix of size n x k for multivariate regression
- vector of size n with entries in {0, 1, ..., k-1} for classification
with k classes
lambd: float, default=0
first regularization parameter
lambd2: float, default=0
second regularization parameter, if needed
lambd3: float, default=0
third regularization parameter, if needed
solver: string, default='auto'
Optimization solver. Possible choices are
- 'ista'
- 'fista'
- 'catalyst-ista'
- 'qning-ista' (proximal quasi-Newton method)
- 'svrg'
- 'catalyst-svrg' (accelerated SVRG with Catalyst)
- 'qning-svrg' (quasi-Newton SVRG)
- 'acc-svrg' (SVRG with direct acceleration)
- 'miso'
- 'catalyst-miso' (accelerated MISO with Catalyst)
- 'qning-miso' (quasi-Newton MISO)
- 'auto'
see the Latex documentation for more details.
If you are unsure, use 'auto'
tol: float, default='1e-3'
Tolerance parameter. For almost all combinations of loss and
penalty functions, this parameter is based on a duality gap.
Assuming the (non-negative) objective function is "f" and its
optimal value is "f^*", the algorithm stops with the guarantee
f(x_t) - f^* <= tol f(x_t)
max_epochs: int, default=500
Maximum number of iterations of the algorithm in terms of passes
over the data
it0: int, default=10
Frequency of duality-gap computation
verbose: boolean, default=True
Display information or not
nthreads: int, default=-1
maximum number of cores the method may use (-1 = all cores).
Note that more cores is not always better.
seed: int, default=0
random seed
restart: boolean, default=False
use a restart strategy (useful for computing regularization path)
univariate: boolean, default=True
univariate or multivariate problems
l_qning: int, default=20
memory parameter for the qning method
f_restart: int, default=50
restart strategy for fista
Returns
-------
numpy array
information about the optimization process (number of iterations,
objective function values, duality gaps); it will be documented in
the future if people ask me.
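Examples
--------
A minimal sketch on synthetic data, using the BinaryClassifier subclass
defined below (sizes and parameter values are illustrative only):
>>> import numpy as np
>>> X = np.random.randn(100, 10)
>>> y = np.sign(np.random.randn(100))
>>> clf = BinaryClassifier(loss='logistic', penalty='l2')
>>> info = clf.fit(X, y, lambd=0.1, solver='auto', verbose=False)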
"""
if y.ndim == 0:
print("Please provide label vector")
return
Xf = X.T if scipy.sparse.issparse(X) else np.asfortranarray(X.T)
p = X.shape[1]+1 if self.fit_intercept else X.shape[1]
y = np.squeeze(y)
if y.shape[0] != X.shape[0]:
print("X and y should have the same number of observations")
return
if univariate:
w0 = np.zeros(p, dtype=Xf.dtype)
yf = np.squeeze(y.astype(Xf.dtype))
else:
if y.squeeze().ndim > 1:
nclasses = y.squeeze().shape[1]
yf = np.asfortranarray(y.T)
else:
nclasses = int(np.max(y)+1)
yf = np.squeeze(np.int32(y))
w0 = np.zeros([p, nclasses], dtype=Xf.dtype, order='F')
if restart and np.any(self.w != 0):
print("Restart")
if self.fit_intercept:
w0[-1, ] = self.b
w0[0:-1, ] = self.w
else:
w0 = self.w
if restart and (solver == 'auto' or solver == 'miso' or
solver == 'catalyst-miso' or solver == 'qning-miso'):
n = X.shape[0]
reset_dual = np.any(self.dual_variable is None)
if not reset_dual and univariate:
reset_dual = self.dual_variable.shape[0] != n
if not reset_dual and not univariate:
reset_dual = self.dual_variable.shape != (n, nclasses)
if reset_dual and univariate:
self.dual_variable = np.zeros(n, dtype=Xf.dtype)
if reset_dual and not univariate:
self.dual_variable = np.zeros([n, nclasses], dtype=Xf.dtype)
w = np.copy(w0)
optim_info = cyanure_wrap.erm_(
Xf, yf, w0, w, dual_variable=self.dual_variable, loss=self.loss,
penalty=self.penalty, solver=solver, lambd=float(lambd),
lambd2=float(lambd2), lambd3=float(lambd3),
intercept=bool(self.fit_intercept), tol=float(tol), it0=int(it0),
nepochs=int(max_epochs), l_qning=int(l_qning),
f_restart=int(f_restart), verbose=bool(verbose),
univariate=bool(univariate), nthreads=int(nthreads), seed=int(seed)
)
if self.fit_intercept:
self.b = w[-1, ]
self.w = w[0:-1, ]
else:
self.w = w
return optim_info
@abstractmethod
def predict(self, X):
"""
predict the labels given an input matrix X (same format as fit)
"""
pass
def get_weights(self):
"""
get the model parameters (either w or the tuple (w,b))
"""
return (self.w, self.b) if self.fit_intercept else self.w
def eval(self, X, y, lambd=0, lambd2=0, lambd3=0):
"""
get the value of the objective function and compute a relative
duality gap; see the function fit for the format of the parameters.
"""
return self.fit(X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3,
max_epochs=0, verbose=False, restart=True)
class BinaryClassifier(ERM):
"""
The binary classification class, which derives from ERM. The goal is to
minimize the following objective
.. math::
\\min_{w,b} \\frac{1}{n} \\sum_{i=1}^n
L\\left( y_i, w^\\top x_i + b\\right) + \\psi(w),
where :math:`L` is a classification loss, :math:`\\psi` is a
regularization function (or constraint), :math:`w` is a p-dimensional
vector representing model parameters, and b is an optional
unregularized intercept. We expect binary labels in {-1,+1}.
Parameters
----------
loss: string, default='square'
Loss function to be used. Possible choices are
- 'square' => :math:`L(y,z) = \\frac{1}{2} ( y-z)^2`
- 'logistic' => :math:`L(y,z) = \\log(1 + e^{-y z} )`
- 'sqhinge' or 'squared_hinge' => :math:`L(y,z) = \\frac{1}{2} \\max( 0, 1- y z)^2`
- 'safe-logistic' => :math:`L(y,z) = e^{ yz - 1 } - y z ~\\text{if}~ yz \\leq 1~~\\text{and}~~0` otherwise
penalty: string, default='l2'
Regularization function psi. Possible choices are
- 'none' => :math:`\\psi(w) = 0`
- 'l2' => :math:`\\psi(w) = \\frac{\\lambda}{2} \\|w\\|_2^2`
- 'l1' => :math:`\\psi(w) = \\lambda \\|w\\|_1`
- 'elastic-net' => :math:`\\psi(w) = \\lambda \\|w\\|_1 + \\frac{\\lambda_2}{2}\\|w\\|_2^2`
- 'fused-lasso' => :math:`\\psi(w) = \\lambda \\sum_{i=2}^p |w[i]-w[i-1]| + \\lambda_2\\|w\\|_1 + \\frac{\\lambda_3}{2}\\|w\\|_2^2`
- 'l1-ball' => encodes the constraint :math:`\\|w\\|_1 \\leq \\lambda`
- 'l2-ball' => encodes the constraint :math:`\\|w\\|_2 \\leq \\lambda`
fit_intercept: boolean, default='False'
learns an unregularized intercept b
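Examples
--------
A minimal sketch on random data (purely illustrative; real data should
typically be preprocessed first, e.g. with the preprocess function):
>>> import numpy as np
>>> X = np.random.randn(200, 20)
>>> y = np.sign(np.random.randn(200))
>>> clf = BinaryClassifier(loss='logistic', penalty='l2')
>>> info = clf.fit(X, y, lambd=0.05, verbose=False)
>>> labels = clf.predict(X)
>>> accuracy = clf.score(X, y)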
"""
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, nthreads=-1, seed=0):
"""
The fitting function (the one that does the job)
Parameters
----------
X: numpy array, or scipy sparse CSR matrix
input n x p numpy matrix; the samples are on the rows
y: labels, numpy array.
- vector of size n with {-1,+1} labels for binary classification,
which will be automatically converted if labels in {0,1} are
provided
lambd: float, default=0
first regularization parameter :math:`\\lambda`
lambd2: float, default=0
second regularization parameter :math:`\\lambda_2`, if needed
lambd3: float, default=0
third regularization parameter :math:`\\lambda_3`, if needed
solver: string, default='auto'
Optimization solver. Possible choices are
- 'ista'
- 'fista'
- 'catalyst-ista'
- 'qning-ista' (proximal quasi-Newton method)
- 'svrg'
- 'catalyst-svrg' (accelerated SVRG with Catalyst)
- 'qning-svrg' (quasi-Newton SVRG)
- 'acc-svrg' (SVRG with direct acceleration)
- 'miso'
- 'catalyst-miso' (accelerated MISO with Catalyst)
- 'qning-miso' (quasi-Newton MISO)
- 'auto'
see the Latex documentation for more details.
If you are unsure, use 'auto'
tol: float, default='1e-3'
Tolerance parameter. For almost all combinations of loss and
penalty functions, this parameter is based on a duality gap.
Assuming the (non-negative) objective function is :math:`f` and its
optimal value is :math:`f^*`, the algorithm stops with the
guarantee
.. math::
f(x_t) - f^* \\leq tol f(x_t)
max_epochs: int, default=500
Maximum number of iterations of the algorithm in terms of passes
over the data
it0: int, default=10
Frequency of duality-gap computation
verbose: boolean, default=True
Display information or not
nthreads: int, default=-1
maximum number of cores the method may use (-1 = all cores).
Note that more cores is not always better.
seed: int, default=0
random seed
restart: boolean, default=False
use a restart strategy (useful for computing regularization path)
l_qning: int, default=20
memory parameter for the qning method
f_restart: int, default=50
restart strategy for fista
Returns
-------
numpy array
information about the optimization process (number of iterations,
objective function values, duality gaps); it will be documented in the
future if people ask me.
"""
y = np.squeeze(y)
uniq = np.unique(y)
nb_classes = len(uniq)
if nb_classes != 2:
raise ValueError("{} classes detected; use MulticlassClassifier instead".format(nb_classes))
if not np.all(uniq == [-1, 1]):
print("You have called BinaryClassifier, labels should be either "
"-1 or 1, but they are")
print(np.unique(y))
print("Automatic conversion to [-1,1]")
neg = y == uniq[0]
y[neg] = -1
y[np.logical_not(neg)] = 1
return super().fit(X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3,
solver=solver, tol=tol, it0=it0,
max_epochs=max_epochs, l_qning=l_qning,
f_restart=f_restart, verbose=verbose,
restart=restart, univariate=True, nthreads=nthreads,
seed=seed)
def predict(self, X):
pred = X.dot(self.w)
if self.fit_intercept:
pred += self.b
return np.sign(pred)
def score(self, X, y):
"""Compute classification accuracy of the model for new test data (X,y)
"""
y = np.squeeze(y)
uniq = np.unique(y)
if not np.all(uniq == [-1, 1]):
print("You have called BinaryClassifier, labels should be either "
"-1 or 1, but they are")
print(np.unique(y))
print("Automatic conversion to [-1,1]")
neg = y == uniq[0]
y[neg] = -1
y[np.logical_not(neg)] = 1
pred = np.squeeze(self.predict(X))
return np.sum(np.squeeze(y) == pred)/pred.shape[0]
class Regression(ERM):
"""
The regression class. The objective is the same as for the BinaryClassifier
class, but we use a regression loss only (see below), and the targets will
be real values.
Parameters
----------
loss: string, default='square'
Only the square loss is implemented at this point
- 'square' => :math:`L(y,z) = \\frac{1}{2} ( y-z)^2`
penalty: string, default='l2'
same as for the class BinaryClassifier
fit_intercept: boolean, default='False'
learns an unregularized intercept b
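Examples
--------
A minimal sketch on synthetic data (sizes and regularization value are
illustrative only):
>>> import numpy as np
>>> X = np.random.randn(150, 10)
>>> y = X.dot(np.random.randn(10)) + 0.1 * np.random.randn(150)
>>> reg = Regression(penalty='l2', fit_intercept=True)
>>> info = reg.fit(X, y, lambd=0.01, verbose=False)
>>> predictions = reg.predict(X)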
"""
def __init__(self, loss='square', penalty='l2', fit_intercept=False):
if loss != 'square':
print("square loss should be used")
return
super().__init__(loss=loss, penalty=penalty,
fit_intercept=fit_intercept)
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, nthreads=-1, seed=0):
"""
The fitting function is the same as for the class BinaryClassifier,
except that we do not necessarily expect binary labels in y.
"""
return super().fit(X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3,
solver=solver, tol=tol, it0=it0,
max_epochs=max_epochs, l_qning=l_qning,
f_restart=f_restart, verbose=verbose,
restart=restart, univariate=True, nthreads=nthreads,
seed=seed)
def predict(self, X):
pred = X.dot(self.w)
if self.fit_intercept:
pred += self.b
return pred
class MultiClassifier(ERM):
"""The multi-class classification class. The goal is to minimize the
following objective
.. math::
\\min_{W,b} \\frac{1}{n} \\sum_{i=1}^n
L\\left( y_i, W^\\top x_i + b\\right) + \\psi(W),
where :math:`L` is a classification loss, :math:`\\psi` is a regularization
function (or constraint), :math:`W=[w_1,\\ldots,w_k]` is a (p x k) matrix
that carries the k predictors, where k is the number of classes, and
:math:`y_i` is a label in :math:`\\{1,\\ldots,k\\}`.
b is a k-dimensional vector representing an unregularized intercept
(which is optional).
Parameters
----------
loss: string, default='square'
Loss function to be used. Possible choices are
- any loss function compatible with the class BinaryClassifier e.g.
('square', 'logistic', 'sqhinge', 'safe-logistic'). In such a case,
the loss function encodes a one vs. all strategy based on the
chosen binary-classification loss.
- 'multiclass-logistic', which is also called multinomial or
softmax logistic:
.. math::
L(y, W^\\top x + b) = \\log\\left(\\sum_{j=1}^k
e^{w_j^\\top x + b_j - w_y^\\top x - b_y} \\right)
penalty: string, default='l2'
Regularization function psi. Possible choices are
- any penalty function compatible with the class BinaryClassifier such
as ('none', 'l2', 'l1', 'elastic-net', 'fused-lasso', 'l1-ball',
'l2-ball'). In such a case, the penalty is applied to each predictor
:math:`w_j` individually:
.. math::
\\psi(W) = \\sum_{j=1}^k \\psi(w_j).
- 'l1l2', which is the multi-task group Lasso regularization
.. math::
\\psi(W) = \\lambda \\sum_{j=1}^p \\|W^j\\|_2~~~~
\\text{where}~W^j~\\text{is the j-th row of}~W.
- 'l1linf'
.. math::
\\psi(W) = \\lambda \\sum_{j=1}^p \\|W^j\\|_\\infty.
- 'l1l2+l1', which is the multi-task group Lasso regularization + l1
.. math::
\\psi(W) = \\sum_{j=1}^p \\lambda \\|W^j\\|_2 + \\lambda_2 \\|W^j\\|_1 ~~~~
\\text{where}~W^j~\\text{is the j-th row of}~W.
fit_intercept: boolean, default='False'
learns an unregularized intercept b, which is a k-dimensional vector
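Examples
--------
A minimal sketch with three classes on random data (illustrative only);
class labels must be integers in {0, 1, ..., k-1}:
>>> import numpy as np
>>> X = np.random.randn(300, 10)
>>> y = np.random.randint(0, 3, 300)
>>> clf = MultiClassifier(loss='multiclass-logistic', penalty='l2')
>>> info = clf.fit(X, y, lambd=0.01, verbose=False)
>>> labels = clf.predict(X)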
"""
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, nthreads=-1, seed=0):
"""Same as BinaryClassifier, but y should be a vector a n-dimensional
vector of integers
"""
y = np.squeeze(y)
if y.squeeze().ndim != 1 or np.any(y != y.astype(int)):
print("y should be a n-dimensional vector of integers")
return
nclasses = np.max(y)+1
uniqu = np.unique(y)
if nclasses == 2:
print("Two classes detected, use BinaryClassifier instead")
return
if (nclasses != uniqu.shape[0] or
not all(np.unique(y) == np.arange(nclasses))):
print("Class labels should be of the form")
print(np.arange(nclasses))
print("but they are")
print(uniqu)
return
return super().fit(
X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3, solver=solver,
tol=tol, it0=it0, max_epochs=max_epochs, l_qning=l_qning,
f_restart=f_restart, verbose=verbose, restart=restart,
univariate=False, nthreads=nthreads, seed=seed)
def predict(self, X):
"""Predicts the class label"""
pred = X.dot(self.w)
if self.fit_intercept:
pred += self.b[np.newaxis, :]
return np.argmax(pred, 1)
def score(self, X, y):
"""Gives a classification score on new test data"""
pred = np.squeeze(self.predict(X))
return np.sum(np.squeeze(y) == pred)/pred.shape[0]
class MultiVariateRegression(ERM):
"""
The multivariate regression class. The objective is the same as for the
MultiClassifier class, but we use a regression loss only (see below), and
the targets :math:`y_i` are k-dimensional vectors.
Parameters
----------
loss: string, default='square'
Only the square loss is implemented at this point. Given two
k-dimensional vectors y,z:
- 'square' => :math:`L(y,z) = \\frac{1}{2} \\|y-z\\|^2`
penalty: string, default='l2'
same as for the class MultiClassifier
fit_intercept: boolean, default='False'
learns an unregularized intercept b
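Examples
--------
A minimal sketch with 3-dimensional targets on random data (illustrative
only):
>>> import numpy as np
>>> X = np.random.randn(100, 10)
>>> Y = np.random.randn(100, 3)
>>> reg = MultiVariateRegression(penalty='l2')
>>> info = reg.fit(X, Y, lambd=0.01, verbose=False)
>>> predictions = reg.predict(X)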
"""
def __init__(self, loss='square', penalty='l2', fit_intercept=False):
if loss != 'square':
print("square loss should be used")
return
super().__init__(loss=loss, penalty=penalty,
fit_intercept=fit_intercept)
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, nthreads=-1, seed=0):
"""Same as ERM.fit, but y should be n x k, where k is size of the
target for each data point
"""
if y.squeeze().ndim <= 1:
print("y should be n x k, where k is size of the target "
"for each data point")
return
return super().fit(
X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3, solver=solver,
tol=tol, it0=it0, max_epochs=max_epochs, l_qning=l_qning,
f_restart=f_restart, verbose=verbose, restart=restart,
univariate=False, nthreads=nthreads, seed=seed)
def predict(self, X):
"""Predicts the targets"""
pred = X.dot(self.w)
if self.fit_intercept:
pred += self.b[np.newaxis, :]
return pred
class LinearSVC(BinaryClassifier):
"""
A compatibility class for scikit-learn users, for the squared hinge loss only.
It is perfectly equivalent to the BinaryClassifier class, but the
regularization parameter (here "C") is provided during the class
initialization. Note that :math:`C= \\frac{1}{2n \\lambda}`
Parameters
----------
loss: should be 'sqhinge' or 'squared_hinge'
penalty: same as BinaryClassifier
fit_intercept: same as BinaryClassifier
C: regularization parameter
max_iter: maximum number of iterations for the optimization solver
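Examples
--------
A minimal sketch on random data (illustrative only); C plays the role of
the scikit-learn regularization parameter:
>>> import numpy as np
>>> X = np.random.randn(200, 20)
>>> y = np.sign(np.random.randn(200))
>>> svc = LinearSVC(penalty='l2', C=1.0)
>>> info = svc.fit(X, y, verbose=False)
>>> labels = svc.predict(X)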
"""
def __init__(self, loss='sqhinge', penalty='l2', fit_intercept=False, C=1,
max_iter=500):
if loss != 'sqhinge' and loss != 'squared_hinge':
print("LinearSVC is only compatible with squared hinge loss at "
"the moment")
super().__init__(loss='sqhinge', penalty=penalty,
fit_intercept=fit_intercept)
self.C = C
self.max_iter = max_iter
def fit(self, X, y, C=None, verbose=None, lambd2=0, lambd3=0,
solver='auto', tol=1e-3, it0=10, max_iter=None, l_qning=20,
f_restart=50, restart=False, nthreads=-1, seed=0):
"""Same as BinaryClassification, but the parameter C replaces lambd,
and max_iter replaces max_epochs.
"""
n = X.shape[0]
if C is not None:
self.C = C
if verbose is None:
verbose = True
self.verbose = verbose
if max_iter is not None:
self.max_iter = max_iter
return super().fit(
X=X, y=y, lambd=1/(2*n*self.C), lambd2=lambd2, lambd3=lambd3,
solver=solver, tol=tol, it0=it0, max_epochs=self.max_iter,
l_qning=l_qning, f_restart=f_restart, verbose=verbose,
restart=restart, nthreads=nthreads, seed=seed)
class LogisticRegression(BinaryClassifier):
"""
A compatibility class for scikit-learn users, for the logistic loss only.
It is perfectly equivalent to the BinaryClassifier class, but the
regularization parameter (here "C") is provided during the class
initialization. Note that :math:`C= \\frac{1}{n \\lambda}`
Parameters
----------
penalty: same as BinaryClassifier
fit_intercept: same as BinaryClassifier
C: regularization parameter
max_iter: maximum number of iterations for the optimization solver
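Examples
--------
A minimal sketch on random data (illustrative only):
>>> import numpy as np
>>> X = np.random.randn(200, 20)
>>> y = np.sign(np.random.randn(200))
>>> logreg = LogisticRegression(penalty='l2', C=10.0)
>>> info = logreg.fit(X, y, verbose=False)
>>> accuracy = logreg.score(X, y)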
"""
def __init__(self, penalty='l2', fit_intercept=False, C=1, max_iter=500):
super().__init__(loss='logistic', penalty=penalty,
fit_intercept=fit_intercept)
self.C = C
self.max_iter = max_iter
def fit(self, X, y, C=None, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_iter=None, l_qning=20, f_restart=50, verbose=None,
restart=False, nthreads=-1, seed=0):
"""
Same as BinaryClassifier.fit, but the parameter C replaces lambd,
and max_iter replaces max_epochs.
"""
n = X.shape[0]
if C is not None:
self.C = C
if verbose is None:
verbose = True
self.verbose = verbose
if max_iter is not None:
self.max_iter = max_iter
return super().fit(
X=X, y=y, lambd=1/(n*self.C), lambd2=lambd2, lambd3=lambd3,
solver=solver, tol=tol, it0=it0, max_epochs=self.max_iter,
l_qning=l_qning, f_restart=f_restart, verbose=verbose,
restart=restart, nthreads=nthreads, seed=seed)