# Author: Julien Mairal <julien.mairal@inria.fr>
#
# License: BSD 3 clause
from abc import abstractmethod
import numpy as np
import scipy.sparse
import cyanure_wrap
def preprocess(X, centering=False, normalize=True, columns=False):
"""Perform in-place centering or normalization, either of columns or rows
of the input matrix X
Parameters
----------
X : numpy array, or scipy sparse CSR matrix
input matrix
centering : boolean, default=False
perform a centering operation
normalize : boolean, default=True
l2-normalization
columns : boolean, default=False
operates on rows (False) or columns (True).
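Examples
--------
A minimal sketch on synthetic data (shapes are illustrative only); the
return value of the underlying routine is ignored here:
>>> import numpy as np
>>> X = np.random.randn(10, 5)
>>> _ = preprocess(X, centering=True, normalize=True, columns=False)
>>> # rows of X are now centered and l2-normalized in place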
"""
if scipy.sparse.issparse(X):
Xf = X.T
else:
Xf = np.asfortranarray(X.T)
return cyanure_wrap.preprocess_(Xf, centering, normalize, not columns)
class ERM:
"""The generic class for empirical risk minimization problems.
For univariate problems, minimizes
min_{w,b} (1/n) sum_{i=1}^n L( y_i, <w, x_i> + b) + psi(w)
"""
def __init__(self, loss='square', penalty='l2', fit_intercept=False):
"""Initialization function of the ERM class.
Parameters
----------
loss : string, default='square'
Loss function to be used. Possible choices are
- 'square' => L(y,z) = 0.5 ( y-z)^2
- 'logistic' => L(y,z) = log(1 + e^{-y z} )
- 'sqhinge' or 'squared_hinge' => L(y,z) = 0.5 max( 0, 1- y z)^2
- 'safe-logistic' => L(y,z) = e^{ yz - 1 } - y z if yz <= 1
and 0 otherwise
- 'multiclass-logistic' => multinomial logistic (see Latex
documentation).
Note that for binary classification, we assume the labels to be of
the form {-1,+1}
penalty: string, default='l2'
Regularization function psi. Possible choices are
For univariate problems
- 'none' => psi(w) = 0
- 'l2' => psi(w) = (lambd/2) ||w||_2^2
- 'l1' => psi(w) = lambd ||w||_1
- 'elastic-net' => psi(w) = lambd ||w||_1 + (lambd2/2)||w||_2^2
- 'fused-lasso' => psi(w) = lambd sum_{i=2}^p |w[i]-w[i-1]|
+ lambd2||w||_1 + (lambd3/2)||w||_2^2
- 'l1-ball' => encodes the constraint ||w||_1 <= lambd
- 'l2-ball' => encodes the constraint ||w||_2 <= lambd
For multivariate problems, the previous penalties operate on each
individual (e.g., class) predictor.
In addition, multitask-group Lasso penalties are provided for
multivariate problems (w is then a matrix)
- 'l1l2' or 'l1linf', see Latex documentation
fit_intercept: boolean, default='False'
learns an unregularized intercept b (or several intercepts for
multivariate problems)
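Examples
--------
A minimal construction sketch with the names listed above (in practice
one would rather instantiate a subclass such as BinaryClassifier or
Regression):
>>> problem = ERM(loss='logistic', penalty='l2', fit_intercept=True)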
"""
self.loss = loss
if (loss == 'squared_hinge'):
self.loss = 'sqhinge'
self.penalty = penalty
self.fit_intercept = fit_intercept
self.w = 0
self.b = 0
self.dual_variable = None
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, univariate=True, nthreads=-1, seed=0):
"""
The fitting function (the one that does the job)
Parameters
----------
X : numpy array, or scipy sparse CSR matrix
input n x p numpy matrix; the samples are on the rows
y : labels, numpy array.
- vector of size n with real values for regression
- vector of size n with {-1,+1} labels for binary classification,
which will be automatically converted if labels in {0,1} are
provided
- matrix of size n x k for multivariate regression
- vector of size n with entries in {0, 1, ..., k-1} for classification
with k classes
lambd: float, default=0
first regularization parameter
lambd2: float, default=0
second regularization parameter, if needed
lambd3: float, default=0
third regularization parameter, if needed
solver: string, default='auto'
Optimization solver. Possible choices are
- 'ista'
- 'fista'
- 'catalyst-ista'
- 'qning-ista' (proximal quasi-Newton method)
- 'svrg'
- 'catalyst-svrg' (accelerated SVRG with Catalyst)
- 'qning-svrg' (quasi-Newton SVRG)
- 'acc-svrg' (SVRG with direct acceleration)
- 'miso'
- 'catalyst-miso' (accelerated MISO with Catalyst)
- 'qning-miso' (quasi-Newton MISO)
- 'auto'
see the Latex documentation for more details.
If you are unsure, use 'auto'
tol: float, default='1e-3'
Tolerance parameter. For almost all combinations of loss and
penalty functions, this parameter is based on a duality gap.
Assuming the (non-negative) objective function is "f" and its
optimal value is "f^*", the algorithm stops with the guarantee
f(x_t) - f^* <= tol f(x_t)
max_epochs: int, default=500
Maximum number of iterations of the algorithm in terms of passes
over the data
it0: int, default=10
Frequency of duality-gap computation
verbose: boolean, default=True
Display information or not
nthreads: int, default=-1
maximum number of cores the method may use (-1 = all cores).
Note that more cores is not always better.
seed: int, default=0
random seed
restart: boolean, default=False
use a restart strategy (useful for computing regularization path)
univariate: boolean, default=True
univariate or multivariate problems
l_qning: int, default=20
memory parameter for the qning method
f_restart: int, default=50
restart strategy for fista
Returns
-------
numpy array
information about the optimization process (number of iterations,
objective function values, duality gaps); it will be documented in
the future if people ask me.
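Examples
--------
A minimal sketch on synthetic data, using the BinaryClassifier subclass
defined below (sizes and parameter values are illustrative only):
>>> import numpy as np
>>> X = np.random.randn(100, 10)
>>> y = np.sign(np.random.randn(100))
>>> clf = BinaryClassifier(loss='logistic', penalty='l2')
>>> info = clf.fit(X, y, lambd=0.1, solver='auto', verbose=False)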
"""
if y.ndim == 0:
print("Please provide label vector")
return
Xf = X.T if scipy.sparse.issparse(X) else np.asfortranarray(X.T)
p = X.shape[1]+1 if self.fit_intercept else X.shape[1]
y = np.squeeze(y)
if y.shape[0] != X.shape[0]:
print("X and y should have the same number of observations")
return
if univariate:
w0 = np.zeros(p, dtype=Xf.dtype)
yf = np.squeeze(y.astype(Xf.dtype))
else:
if y.squeeze().ndim > 1:
nclasses = y.squeeze().shape[1]
yf = np.asfortranarray(y.T)
else:
nclasses = int(np.max(y)+1)
yf = np.squeeze(np.int32(y))
w0 = np.zeros([p, nclasses], dtype=Xf.dtype, order='F')
if restart and np.any(self.w != 0):
print("Restart")
if self.fit_intercept:
w0[-1, ] = self.b
w0[0:-1, ] = self.w
else:
w0 = self.w
if restart and (solver == 'auto' or solver == 'miso' or
solver == 'catalyst-miso' or solver == 'qning-miso'):
n = X.shape[0]
reset_dual = np.any(self.dual_variable is None)
if not reset_dual and univariate:
reset_dual = self.dual_variable.shape[0] != n
if not reset_dual and not univariate:
reset_dual = self.dual_variable.shape != (n, nclasses)
if reset_dual and univariate:
self.dual_variable = np.zeros(n, dtype=Xf.dtype)
if reset_dual and not univariate:
self.dual_variable = np.zeros([n, nclasses], dtype=Xf.dtype)
w = np.copy(w0)
optim_info = cyanure_wrap.erm_(
Xf, yf, w0, w, dual_variable=self.dual_variable, loss=self.loss,
penalty=self.penalty, solver=solver, lambd=float(lambd),
lambd2=float(lambd2), lambd3=float(lambd3),
intercept=bool(self.fit_intercept), tol=float(tol), it0=int(it0),
nepochs=int(max_epochs), l_qning=int(l_qning),
f_restart=int(f_restart), verbose=bool(verbose),
univariate=bool(univariate), nthreads=int(nthreads), seed=int(seed)
)
if self.fit_intercept:
self.b = w[-1, ]
self.w = w[0:-1, ]
else:
self.w = w
return optim_info
@abstractmethod
def predict(self, X):
"""
predict the labels given an input matrix X (same format as fit)
"""
pass
def get_weights(self):
"""
get the model parameters (either w or the tuple (w,b))
"""
return (self.w, self.b) if self.fit_intercept else self.w
def eval(self, X, y, lambd=0, lambd2=0, lambd3=0):
"""
get the value of the objective function and compute a relative
duality gap; see the function fit for the format of the parameters.
"""
return self.fit(X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3,
max_epochs=0, verbose=False, restart=True)
class BinaryClassifier(ERM):
"""
The binary classification class, which derives from ERM. The goal is to
minimize the following objective
.. math::
\\min_{w,b} \\frac{1}{n} \\sum_{i=1}^n
L\\left( y_i, w^\\top x_i + b\\right) + \\psi(w),
where :math:`L` is a classification loss, :math:`\\psi` is a
regularization function (or constraint), :math:`w` is a p-dimensional
vector representing model parameters, and b is an optional
unregularized intercept. We expect binary labels in {-1,+1}.
Parameters
----------
loss: string, default='square'
Loss function to be used. Possible choices are
- 'square' => :math:`L(y,z) = \\frac{1}{2} ( y-z)^2`
- 'logistic' => :math:`L(y,z) = \\log(1 + e^{-y z} )`
- 'sqhinge' or 'squared_hinge' => :math:`L(y,z) = \\frac{1}{2} \\max( 0, 1- y z)^2`
- 'safe-logistic' => :math:`L(y,z) = e^{ yz - 1 } - y z ~\\text{if}~ yz \\leq 1~~\\text{and}~~0` otherwise
penalty: string, default='l2'
Regularization function psi. Possible choices are
- 'none' => :math:`\\psi(w) = 0`
- 'l2' => :math:`\\psi(w) = \\frac{\\lambda}{2} \\|w\\|_2^2`
- 'l1' => :math:`\\psi(w) = \\lambda \\|w\\|_1`
- 'elastic-net' => :math:`\\psi(w) = \\lambda \\|w\\|_1 + \\frac{\\lambda_2}{2}\\|w\\|_2^2`
- 'fused-lasso' => :math:`\\psi(w) = \\lambda \\sum_{i=2}^p |w[i]-w[i-1]| + \\lambda_2\\|w\\|_1 + \\frac{\\lambda_3}{2}\\|w\\|_2^2`
- 'l1-ball' => encodes the constraint :math:`\\|w\\|_1 \\leq \\lambda`
- 'l2-ball' => encodes the constraint :math:`\\|w\\|_2 \\leq \\lambda`
fit_intercept: boolean, default='False'
learns an unregularized intercept b
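Examples
--------
A minimal sketch on random data (purely illustrative; real data should
typically be preprocessed first, e.g. with the preprocess function):
>>> import numpy as np
>>> X = np.random.randn(200, 20)
>>> y = np.sign(np.random.randn(200))
>>> clf = BinaryClassifier(loss='logistic', penalty='l2')
>>> info = clf.fit(X, y, lambd=0.05, verbose=False)
>>> labels = clf.predict(X)
>>> accuracy = clf.score(X, y)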
"""
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, nthreads=-1, seed=0):
"""
The fitting function (the one that does the job)
Parameters
----------
X: numpy array, or scipy sparse CSR matrix
input n x p numpy matrix; the samples are on the rows
y: labels, numpy array.
- vector of size n with {-1,+1} labels for binary classification,
which will be automatically converted if labels in {0,1} are
provided
lambd: float, default=0
first regularization parameter :math:`\\lambda`
lambd2: float, default=0
second regularization parameter :math:`\\lambda_2`, if needed
lambd3: float, default=0
third regularization parameter :math:`\\lambda_3`, if needed
solver: string, default='auto'
Optimization solver. Possible choices are
- 'ista'
- 'fista'
- 'catalyst-ista'
- 'qning-ista' (proximal quasi-Newton method)
- 'svrg'
- 'catalyst-svrg' (accelerated SVRG with Catalyst)
- 'qning-svrg' (quasi-Newton SVRG)
- 'acc-svrg' (SVRG with direct acceleration)
- 'miso'
- 'catalyst-miso' (accelerated MISO with Catalyst)
- 'qning-miso' (quasi-Newton MISO)
- 'auto'
see the Latex documentation for more details.
If you are unsure, use 'auto'
tol: float, default='1e-3'
Tolerance parameter. For almost all combinations of loss and
penalty functions, this parameter is based on a duality gap.
Assuming the (non-negative) objective function is :math:`f` and its
optimal value is :math:`f^*`, the algorithm stops with the
guarantee
.. math::
f(x_t) - f^* \\leq tol f(x_t)
max_epochs: int, default=500
Maximum number of iterations of the algorithm in terms of passes
over the data
it0: int, default=10
Frequency of duality-gap computation
verbose: boolean, default=True
Display information or not
nthreads: int, default=-1
maximum number of cores the method may use (-1 = all cores).
Note that more cores is not always better.
seed: int, default=0
random seed
restart: boolean, default=False
use a restart strategy (useful for computing regularization path)
l_qning: int, default=20
memory parameter for the qning method
f_restart: int, default=50
restart strategy for fista
Returns
-------
numpy array
information about the optimization process (number of iterations,
objective function values, duality gaps); it will be documented in the
future if people ask me.
"""
y = np.squeeze(y)
uniq = np.unique(y)
nb_classes = len(uniq)
if nb_classes != 2:
raise ValueError("{} classes detected; use MulticlassClassifier instead".format(nb_classes))
if not np.all(uniq == [-1, 1]):
print("You have called BinaryClassifier, labels should be either "
"-1 or 1, but they are")
print(np.unique(y))
print("Automatic conversion to [-1,1]")
neg = y == uniq[0]
y[neg] = -1
y[np.logical_not(neg)] = 1
return super().fit(X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3,
solver=solver, tol=tol, it0=it0,
max_epochs=max_epochs, l_qning=l_qning,
f_restart=f_restart, verbose=verbose,
restart=restart, univariate=True, nthreads=nthreads,
seed=seed)
def predict(self, X):
pred = X.dot(self.w)
if self.fit_intercept:
pred += self.b
return np.sign(pred)
def score(self, X, y):
"""Compute classification accuracy of the model for new test data (X,y)
"""
y = np.squeeze(y)
uniq = np.unique(y)
if not np.all(uniq == [-1, 1]):
print("You have called BinaryClassifier, labels should be either "
"-1 or 1, but they are")
print(np.unique(y))
print("Automatic conversion to [-1,1]")
neg = y == uniq[0]
y[neg] = -1
y[np.logical_not(neg)] = 1
pred = np.squeeze(self.predict(X))
return np.sum(np.squeeze(y) == pred)/pred.shape[0]
class Regression(ERM):
"""
The regression class. The objective is the same as for the BinaryClassifier
class, but we use a regression loss only (see below), and the targets will
be real values.
Parameters
----------
loss: string, default='square'
Only the square loss is implemented at this point
- 'square' => :math:`L(y,z) = \\frac{1}{2} ( y-z)^2`
penalty: string, default='l2'
same as for the class BinaryClassifier
fit_intercept: boolean, default='False'
learns an unregularized intercept b
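Examples
--------
A minimal sketch on synthetic data (sizes and regularization value are
illustrative only):
>>> import numpy as np
>>> X = np.random.randn(150, 10)
>>> y = X.dot(np.random.randn(10)) + 0.1 * np.random.randn(150)
>>> reg = Regression(penalty='l2', fit_intercept=True)
>>> info = reg.fit(X, y, lambd=0.01, verbose=False)
>>> predictions = reg.predict(X)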
"""
def __init__(self, loss='square', penalty='l2', fit_intercept=False):
if loss != 'square':
print("square loss should be used")
return
super().__init__(loss=loss, penalty=penalty,
fit_intercept=fit_intercept)
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, nthreads=-1, seed=0):
"""
The fitting function is the same as for the class BinaryClassifier,
except that we do not necessarily expect binary labels in y.
"""
return super().fit(X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3,
solver=solver, tol=tol, it0=it0,
max_epochs=max_epochs, l_qning=l_qning,
f_restart=f_restart, verbose=verbose,
restart=restart, univariate=True, nthreads=nthreads,
seed=seed)
def predict(self, X):
pred = X.dot(self.w)
if self.fit_intercept:
pred += self.b
return pred
class MultiClassifier(ERM):
"""The multi-class classification class. The goal is to minimize the
following objective
.. math::
\\min_{W,b} \\frac{1}{n} \\sum_{i=1}^n
L\\left( y_i, W^\\top x_i + b\\right) + \\psi(W),
where :math:`L` is a classification loss, :math:`\\psi` is a regularization
function (or constraint), :math:`W=[w_1,\\ldots,w_k]` is a (p x k) matrix
that carries the k predictors, where k is the number of classes, and
:math:`y_i` is a label in :math:`\\{1,\\ldots,k\\}`.
b is a k-dimensional vector representing an unregularized intercept
(which is optional).
Parameters
----------
loss: string, default='square'
Loss function to be used. Possible choices are
- any loss function compatible with the class BinaryClassifier e.g.
('square', 'logistic', 'sqhinge', 'safe-logistic'). In such a case,
the loss function encodes a one vs. all strategy based on the
chosen binary-classification loss.
- 'multiclass-logistic', which is also called multinomial or
softmax logistic:
.. math::
L(y, W^\\top x + b) = \\log\\left(\\sum_{j=1}^k
e^{w_j^\\top x + b_j - w_y^\\top x - b_y} \\right)
penalty: string, default='l2'
Regularization function psi. Possible choices are
- any penalty function compatible with the class BinaryClassifier such
as ('none', 'l2', 'l1', 'elastic-net', 'fused-lasso', 'l1-ball',
'l2-ball'). In such a case, the penalty is applied to each predictor
:math:`w_j` individually:
.. math::
\\psi(W) = \\sum_{j=1}^k \\psi(w_j).
- 'l1l2', which is the multi-task group Lasso regularization
.. math::
\\psi(W) = \\lambda \\sum_{j=1}^p \\|W^j\\|_2~~~~
\\text{where}~W^j~\\text{is the j-th row of}~W.
- 'l1linf'
.. math::
\\psi(W) = \\lambda \\sum_{j=1}^p \\|W^j\\|_\\infty.
- 'l1l2+l1', which is the multi-task group Lasso regularization + l1
.. math::
\\psi(W) = \\sum_{j=1}^p \\lambda \\|W^j\\|_2 + \\lambda_2 \\|W^j\\|_1 ~~~~
\\text{where}~W^j~\\text{is the j-th row of}~W.
fit_intercept: boolean, default='False'
learns an unregularized intercept b, which is a k-dimensional vector
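Examples
--------
A minimal sketch with three classes on random data (illustrative only);
class labels must be integers in {0, 1, ..., k-1}:
>>> import numpy as np
>>> X = np.random.randn(300, 10)
>>> y = np.random.randint(0, 3, 300)
>>> clf = MultiClassifier(loss='multiclass-logistic', penalty='l2')
>>> info = clf.fit(X, y, lambd=0.01, verbose=False)
>>> labels = clf.predict(X)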
"""
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, nthreads=-1, seed=0):
"""Same as BinaryClassifier, but y should be a vector a n-dimensional
vector of integers
"""
y = np.squeeze(y)
if y.squeeze().ndim != 1 or np.any(y != y.astype(int)):
print("y should be a n-dimensional vector of integers")
return
nclasses = np.max(y)+1
uniqu = np.unique(y)
if nclasses == 2:
print("Two classes detected, use BinaryClassifier instead")
return
if (nclasses != uniqu.shape[0] or
not all(np.unique(y) == np.arange(nclasses))):
print("Class labels should be of the form")
print(np.arange(nclasses))
print("but they are")
print(uniqu)
return
return super().fit(
X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3, solver=solver,
tol=tol, it0=it0, max_epochs=max_epochs, l_qning=l_qning,
f_restart=f_restart, verbose=verbose, restart=restart,
univariate=False, nthreads=nthreads, seed=seed)
def predict(self, X):
"""Predicts the class label"""
pred = X.dot(self.w)
if self.fit_intercept:
pred += self.b[np.newaxis, :]
return np.argmax(pred, 1)
def score(self, X, y):
"""Gives a classification score on new test data"""
pred = np.squeeze(self.predict(X))
return np.sum(np.squeeze(y) == pred)/pred.shape[0]
class MultiVariateRegression(ERM):
"""
The multivariate regression class. The objective is the same as for the
MultiClassifier class, but we use a regression loss only (see below), and
the targets :math:`y_i` are k-dimensional vectors.
Parameters
----------
loss: string, default='square'
Only the square loss is implemented at this point. Given two
k-dimensional vectors y,z:
- 'square' => :math:`L(y,z) = \\frac{1}{2} \\|y-z\\|^2`
penalty: string, default='l2'
same as for the class MultiClassifier
fit_intercept: boolean, default='False'
learns an unregularized intercept b
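Examples
--------
A minimal sketch with 3-dimensional targets on random data (illustrative
only):
>>> import numpy as np
>>> X = np.random.randn(100, 10)
>>> Y = np.random.randn(100, 3)
>>> reg = MultiVariateRegression(penalty='l2')
>>> info = reg.fit(X, Y, lambd=0.01, verbose=False)
>>> predictions = reg.predict(X)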
"""
def __init__(self, loss='square', penalty='l2', fit_intercept=False):
if loss != 'square':
print("square loss should be used")
return
super().__init__(loss=loss, penalty=penalty,
fit_intercept=fit_intercept)
def fit(self, X, y, lambd=0, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_epochs=500, l_qning=20, f_restart=50, verbose=True,
restart=False, nthreads=-1, seed=0):
"""Same as ERM.fit, but y should be n x k, where k is size of the
target for each data point
"""
if y.squeeze().ndim <= 1:
print("y should be n x k, where k is size of the target "
"for each data point")
return
return super().fit(
X, y, lambd=lambd, lambd2=lambd2, lambd3=lambd3, solver=solver,
tol=tol, it0=it0, max_epochs=max_epochs, l_qning=l_qning,
f_restart=f_restart, verbose=verbose, restart=restart,
univariate=False, nthreads=nthreads, seed=seed)
def predict(self, X):
"""Predicts the targets"""
pred = X.dot(self.w)
if self.fit_intercept:
pred += self.b[np.newaxis, :]
return pred
class LinearSVC(BinaryClassifier):
"""
A compatibility class for scikit-learn users, for the squared hinge loss only.
It is perfectly equivalent to the BinaryClassifier class, but the
regularization parameter (here "C") is provided during the class
initialization. Note that :math:`C= \\frac{1}{2n \\lambda}`
Parameters
----------
loss: should be 'sqhinge' or 'squared_hinge'
penalty: same as BinaryClassifier
fit_intercept: same as BinaryClassifier
C: regularization parameter
max_iter: maximum number of iterations for the optimization solver
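Examples
--------
A minimal sketch on random data (illustrative only); C plays the role of
the scikit-learn regularization parameter:
>>> import numpy as np
>>> X = np.random.randn(200, 20)
>>> y = np.sign(np.random.randn(200))
>>> svc = LinearSVC(penalty='l2', C=1.0)
>>> info = svc.fit(X, y, verbose=False)
>>> labels = svc.predict(X)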
"""
def __init__(self, loss='sqhinge', penalty='l2', fit_intercept=False, C=1,
max_iter=500):
if loss != 'sqhinge' and loss != 'squared_hinge':
print("LinearSVC is only compatible with squared hinge loss at "
"the moment")
super().__init__(loss='sqhinge', penalty=penalty,
fit_intercept=fit_intercept)
self.C = C
self.max_iter = max_iter
def fit(self, X, y, C=None, verbose=None, lambd2=0, lambd3=0,
solver='auto', tol=1e-3, it0=10, max_iter=None, l_qning=20,
f_restart=50, restart=False, nthreads=-1, seed=0):
"""Same as BinaryClassification, but the parameter C replaces lambd,
and max_iter replaces max_epochs.
"""
n = X.shape[0]
if C is not None:
self.C = C
if verbose is None:
verbose = True
self.verbose = verbose
if max_iter is not None:
self.max_iter = max_iter
return super().fit(
X=X, y=y, lambd=1/(2*n*self.C), lambd2=lambd2, lambd3=lambd3,
solver=solver, tol=tol, it0=it0, max_epochs=self.max_iter,
l_qning=l_qning, f_restart=f_restart, verbose=verbose,
restart=restart, nthreads=nthreads, seed=seed)
class LogisticRegression(BinaryClassifier):
"""
A compatibility class for scikit-learn users, for the logistic loss only.
It is perfectly equivalent to the BinaryClassifier class, but the
regularization parameter (here "C") is provided during the class
initialization. Note that :math:`C= \\frac{1}{n \\lambda}`
Parameters
----------
penalty: same as BinaryClassifier
fit_intercept: same as BinaryClassifier
C: regularization parameter
max_iter: maximum number of iterations for the optimization solver
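Examples
--------
A minimal sketch on random data (illustrative only):
>>> import numpy as np
>>> X = np.random.randn(200, 20)
>>> y = np.sign(np.random.randn(200))
>>> logreg = LogisticRegression(penalty='l2', C=10.0)
>>> info = logreg.fit(X, y, verbose=False)
>>> accuracy = logreg.score(X, y)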
"""
def __init__(self, penalty='l2', fit_intercept=False, C=1, max_iter=500):
super().__init__(loss='logistic', penalty=penalty,
fit_intercept=fit_intercept)
self.C = C
self.max_iter = max_iter
def fit(self, X, y, C=None, lambd2=0, lambd3=0, solver='auto', tol=1e-3,
it0=10, max_iter=None, l_qning=20, f_restart=50, verbose=None,
restart=False, nthreads=-1, seed=0):
"""
Same as BinaryClassifier.fit, but the parameter C replaces lambd,
and max_iter replaces max_epochs.
"""
n = X.shape[0]
if C is not None:
self.C = C
if verbose is None:
verbose = True
self.verbose = verbose
if max_iter is not None:
self.max_iter = max_iter
return super().fit(
X=X, y=y, lambd=1/(n*self.C), lambd2=lambd2, lambd3=lambd3,
solver=solver, tol=tol, it0=it0, max_epochs=self.max_iter,
l_qning=l_qning, f_restart=f_restart, verbose=verbose,
restart=restart, nthreads=nthreads, seed=seed)