# License: BSD 3 clause
import numpy as np
from scipy.special import gammaln
from tick.base_model import ModelGeneralizedLinear, ModelFirstOrder, \
ModelSecondOrder, ModelSelfConcordant
from .build.linear_model import ModelPoisRegDouble as _ModelPoisRegDouble
from .build.linear_model import ModelPoisRegFloat as _ModelPoisRegFloat
from .build.linear_model import LinkType_identity as identity
from .build.linear_model import LinkType_exponential as exponential
__author__ = 'Stephane Gaiffas'
# Maps the dtype of the features array to the matching compiled backend
# class; consumed by ModelPoisReg._build_cpp_model via _get_typed_class.
dtype_map = {
np.dtype('float32'): _ModelPoisRegFloat,
np.dtype('float64'): _ModelPoisRegDouble
}
class ModelPoisReg(ModelGeneralizedLinear, ModelSecondOrder,
                   ModelSelfConcordant):
    """Poisson regression model with identity or exponential link for data with
    a count label. This class gives first order and second order information for
    this model (gradient, loss and hessian norm) and can be passed
    to any solver through the solver's ``set_model`` method.
    Can lead to overflows with some solvers (see the note below).

    Given training data :math:`(x_i, y_i) \\in \\mathbb R^d \\times \\mathbb N`
    for :math:`i=1, \\ldots, n`, this model considers a goodness-of-fit

    .. math::
        f(w, b) = \\frac 1n \\sum_{i=1}^n \\ell(y_i, b + x_i^\\top w),

    where :math:`w \\in \\mathbb R^d` is a vector containing the model-weights,
    :math:`b \\in \\mathbb R` is the intercept (used only whenever
    ``fit_intercept=True``) and
    :math:`\\ell : \\mathbb R^2 \\rightarrow \\mathbb R` is the loss given by

    .. math::
        \\ell(y, y') = e^{y'} - y y'

    whenever ``link='exponential'`` and

    .. math::
        \\ell(y, y') = y' - y \\log(y')

    whenever ``link='identity'``, for :math:`y \\in \\mathbb N` and
    :math:`y' \\in \\mathbb R`. Data is passed
    to this model through the ``fit(X, y)`` method where X is the features
    matrix (dense or sparse) and y is the vector of labels.

    Parameters
    ----------
    fit_intercept : `bool`
        If `True`, the model uses an intercept

    link : `str`, default="exponential"
        Type of link function

        * if ``"identity"``: the intensity is the inner product of the
          model's coeffs with the features. In this case, one
          must ensure that the intensity is non-negative

        * if ``"exponential"``: the intensity is the exponential of the
          inner product of the model's coeffs with the features.

        Note that link cannot be changed after creation of
        `ModelPoisReg`

    Attributes
    ----------
    features : {`numpy.ndarray`, `scipy.sparse.csr_matrix`}, shape=(n_samples, n_features)
        The features matrix, either dense or sparse

    labels : `numpy.ndarray`, shape=(n_samples,) (read-only)
        The labels vector

    n_samples : `int` (read-only)
        Number of samples

    n_features : `int` (read-only)
        Number of features

    n_coeffs : `int` (read-only)
        Total number of coefficients of the model

    n_threads : `int`, default=1 (read-only)
        Number of threads used for parallel computation.

        * if ``int <= 0``: the number of threads available on
          the CPU
        * otherwise the desired number of threads

    dtype : `{'float64', 'float32'}`, default='float64'
        Type of the data arrays used.

    Notes
    -----
    The gradient and loss for the exponential link case cannot be
    overflow proof. In this case, only a solver working in the dual
    (such as `SDCA`) should be used.

    In summary, use grad and call at your own risk when
    ``link="exponential"``
    """

    # Both attributes are frozen: the link is fixed at construction time and
    # may only be written through the ``_set`` backdoor used below.
    _attrinfos = {
        "_link_type": {
            "writable": False
        },
        "_link": {
            "writable": False
        }
    }

    def __init__(self, fit_intercept: bool = True, link: str = "exponential",
                 n_threads: int = 1):
        ModelSecondOrder.__init__(self)
        ModelGeneralizedLinear.__init__(self, fit_intercept)
        ModelSelfConcordant.__init__(self)
        # ``_link`` must be None so the link setter accepts this first (and
        # only) assignment; afterwards the property is read-only.
        self._set("_link", None)
        self.link = link
        self.n_threads = n_threads

    # TODO: implement _set_data and not fit
    def fit(self, features, labels):
        """Set the data into the model object

        Parameters
        ----------
        features : {`numpy.ndarray`, `scipy.sparse.csr_matrix`}, shape=(n_samples, n_features)
            The features matrix, either dense or sparse

        labels : `numpy.ndarray`, shape=(n_samples,)
            The labels vector

        Returns
        -------
        output : `ModelPoisReg`
            The current instance with given data
        """
        ModelFirstOrder.fit(self, features, labels)
        ModelGeneralizedLinear.fit(self, features, labels)
        # Instantiate the float32/float64 C++ backend matching the data dtype.
        self._set("_model", self._build_cpp_model(features.dtype))
        return self

    def _grad(self, coeffs: np.ndarray, out: np.ndarray) -> None:
        # Gradient is computed by the C++ backend; ``out`` is filled in place.
        self._model.grad(coeffs, out)

    def _loss(self, coeffs: np.ndarray) -> float:
        # Loss is computed by the C++ backend.
        return self._model.loss(coeffs)

    @property
    def link(self):
        return self._link

    @link.setter
    def link(self, value):
        # The link can be set exactly once (from __init__); any later
        # assignment is rejected.
        if self._link is not None:
            raise ValueError("link is read only")
        if value == "exponential":
            self._set("_link_type", exponential)
        elif value == "identity":
            self._set("_link_type", identity)
        else:
            # Fixed message: the accepted value is 'identity', not 'linear'.
            raise ValueError("``link`` must be either 'exponential' or "
                             "'identity'.")
        self._set("_link", value)

    def _get_sc_constant(self) -> float:
        """Self-concordance parameter of the Poisson
        regression loss
        """
        if self.link == "identity":
            y = self.labels
            # Constant is driven by the smallest strictly positive label.
            return 2 * (1. / np.sqrt(y[y > 0])).max()
        else:
            raise ValueError(("Poisson regression with exponential "
                              "link is not self-concordant"))

    # TODO: C++ for this
    def _hessian_norm(self, coeffs: np.ndarray, point: np.ndarray) -> float:
        link = self.link
        features, labels = self.features, self.labels
        if link == "identity":
            z1 = features.dot(coeffs)
            z2 = features.dot(point)
            # TODO: beware of zeros in z1 or z2 !
            return np.sqrt((labels * z1 ** 2 / z2 ** 2).mean())
        elif link == "exponential":
            raise NotImplementedError("exp link is not yet implemented")
        else:
            # Fixed message: the accepted value is 'identity', not 'linear'.
            raise ValueError("``link`` must be either 'exponential' or "
                             "'identity'.")

    def _sdca_primal_dual_relation(self, l_l2sq, dual_vector):
        # In order to solve the same problem than other solvers, we need to
        # rescale the penalty parameter if some observations are not
        # considered in SDCA. This is useful for Poisson regression with
        # identity link
        if self.link == "identity":
            scaled_l_l2sq = l_l2sq * self.n_samples / self._sdca_rand_max
        else:
            scaled_l_l2sq = l_l2sq
        primal_vector = np.empty(self.n_coeffs, dtype=self.dtype)
        self._model.sdca_primal_dual_relation(scaled_l_l2sq, dual_vector,
                                              primal_vector)
        return primal_vector

    @property
    def _sdca_rand_max(self):
        # With the identity link, SDCA only samples observations whose label
        # is non-zero, hence the number of such observations is returned.
        if self.link == "identity":
            non_zero_labels = self.labels != 0
            return non_zero_labels.sum().item()
        else:
            raise NotImplementedError()

    def dual_loss(self, dual_coeffs):
        """Computes the dual loss at the given dual coefficients

        Parameters
        ----------
        dual_coeffs : `np.ndarray`
            Dual coefficients

        Returns
        -------
        dual_loss : `float`
            The value of the dual loss
        """
        if self.link != "identity":
            raise NotImplementedError()
        # Only observations with a non-zero label take part in the dual.
        non_zero_labels = self.labels != 0
        dual_loss = self.labels[non_zero_labels] * (
            1 + np.log(dual_coeffs / self.labels[non_zero_labels]))
        dual_loss += np.mean(gammaln(self.labels[non_zero_labels] + 1))
        return np.mean(dual_loss) * self._sdca_rand_max / self.n_samples

    def _build_cpp_model(self, dtype_or_object_with_dtype):
        # Resolve the float32/float64 backend class from dtype_map, then
        # hand it the data and model configuration.
        model_class = self._get_typed_class(dtype_or_object_with_dtype,
                                            dtype_map)
        return model_class(self.features, self.labels, self._link_type,
                           self.fit_intercept, self.n_threads)