# License: BSD 3 clause
import numpy as np
from scipy.special import gammaln
from tick.base_model import ModelGeneralizedLinear, ModelFirstOrder, \
ModelSecondOrder, ModelSelfConcordant
from .build.linear_model import ModelPoisRegDouble as _ModelPoisRegDouble
from .build.linear_model import ModelPoisRegFloat as _ModelPoisRegFloat
from .build.linear_model import LinkType_identity as identity
from .build.linear_model import LinkType_exponential as exponential
__author__ = 'Stephane Gaiffas'
# Maps the dtype of the features array to the matching compiled backend
# class; consumed by ModelPoisReg._build_cpp_model via _get_typed_class.
dtype_map = {
np.dtype('float32'): _ModelPoisRegFloat,
np.dtype('float64'): _ModelPoisRegDouble
}
class ModelPoisReg(ModelGeneralizedLinear, ModelSecondOrder,
                   ModelSelfConcordant):
    """Poisson regression model with identity or exponential link for data with
    a count label. This class gives first order and second order information for
    this model (gradient, loss and hessian norm) and can be passed
    to any solver through the solver's ``set_model`` method.
    Can lead to overflows with some solvers (see the note below).

    Given training data :math:`(x_i, y_i) \\in \\mathbb R^d \\times \\mathbb N`
    for :math:`i=1, \\ldots, n`, this model considers a goodness-of-fit

    .. math::
        f(w, b) = \\frac 1n \\sum_{i=1}^n \\ell(y_i, b + x_i^\\top w),

    where :math:`w \\in \\mathbb R^d` is a vector containing the model-weights,
    :math:`b \\in \\mathbb R` is the intercept (used only whenever
    ``fit_intercept=True``) and
    :math:`\\ell : \\mathbb R^2 \\rightarrow \\mathbb R` is the loss given by

    .. math::
        \\ell(y, y') = e^{y'} - y y'

    whenever ``link='exponential'`` and

    .. math::
        \\ell(y, y') = y' - y \\log(y')

    whenever ``link='identity'``, for :math:`y \\in \\mathbb N` and
    :math:`y' \\in \\mathbb R`. Data is passed
    to this model through the ``fit(X, y)`` method where X is the features
    matrix (dense or sparse) and y is the vector of labels.

    Parameters
    ----------
    fit_intercept : `bool`
        If `True`, the model uses an intercept

    link : `str`, default="exponential"
        Type of link function

        * if ``"identity"``: the intensity is the inner product of the
          model's coeffs with the features. In this case, one
          must ensure that the intensity is non-negative

        * if ``"exponential"``: the intensity is the exponential of the
          inner product of the model's coeffs with the features.

        Note that link cannot be changed after creation of
        `ModelPoisReg`

    Attributes
    ----------
    features : {`numpy.ndarray`, `scipy.sparse.csr_matrix`}, shape=(n_samples, n_features)
        The features matrix, either dense or sparse

    labels : `numpy.ndarray`, shape=(n_samples,) (read-only)
        The labels vector

    n_samples : `int` (read-only)
        Number of samples

    n_features : `int` (read-only)
        Number of features

    n_coeffs : `int` (read-only)
        Total number of coefficients of the model

    n_threads : `int`, default=1 (read-only)
        Number of threads used for parallel computation.

        * if ``int <= 0``: the number of threads available on
          the CPU
        * otherwise the desired number of threads

    dtype : `{'float64', 'float32'}`, default='float64'
        Type of the data arrays used.

    Notes
    -----
    The gradient and loss for the exponential link case cannot be
    overflow proof. In this case, only a solver working in the dual
    (such as `SDCA`) should be used.

    In summary, use grad and call at your own risk when
    ``link="exponential"``
    """

    # Both attributes are frozen: the link is fixed at construction time and
    # may only be written through the ``_set`` backdoor used below.
    _attrinfos = {
        "_link_type": {
            "writable": False
        },
        "_link": {
            "writable": False
        }
    }

    def __init__(self, fit_intercept: bool = True, link: str = "exponential",
                 n_threads: int = 1):
        ModelSecondOrder.__init__(self)
        ModelGeneralizedLinear.__init__(self, fit_intercept)
        ModelSelfConcordant.__init__(self)
        # ``_link`` must be None so the link setter accepts this first (and
        # only) assignment; afterwards the property is read-only.
        self._set("_link", None)
        self.link = link
        self.n_threads = n_threads

    # TODO: implement _set_data and not fit
    def fit(self, features, labels):
        """Set the data into the model object

        Parameters
        ----------
        features : {`numpy.ndarray`, `scipy.sparse.csr_matrix`}, shape=(n_samples, n_features)
            The features matrix, either dense or sparse

        labels : `numpy.ndarray`, shape=(n_samples,)
            The labels vector

        Returns
        -------
        output : `ModelPoisReg`
            The current instance with given data
        """
        ModelFirstOrder.fit(self, features, labels)
        ModelGeneralizedLinear.fit(self, features, labels)
        # Instantiate the float32/float64 C++ backend matching the data dtype.
        self._set("_model", self._build_cpp_model(features.dtype))
        return self

    def _grad(self, coeffs: np.ndarray, out: np.ndarray) -> None:
        # Gradient is computed by the C++ backend; ``out`` is filled in place.
        self._model.grad(coeffs, out)

    def _loss(self, coeffs: np.ndarray) -> float:
        # Loss is computed by the C++ backend.
        return self._model.loss(coeffs)

    @property
    def link(self):
        return self._link

    @link.setter
    def link(self, value):
        # The link can be set exactly once (from __init__); any later
        # assignment is rejected.
        if self._link is not None:
            raise ValueError("link is read only")
        if value == "exponential":
            self._set("_link_type", exponential)
        elif value == "identity":
            self._set("_link_type", identity)
        else:
            # Fixed message: the accepted value is 'identity', not 'linear'.
            raise ValueError("``link`` must be either 'exponential' or "
                             "'identity'.")
        self._set("_link", value)

    def _get_sc_constant(self) -> float:
        """Self-concordance parameter of the Poisson
        regression loss
        """
        if self.link == "identity":
            y = self.labels
            # Constant is driven by the smallest strictly positive label.
            return 2 * (1. / np.sqrt(y[y > 0])).max()
        else:
            raise ValueError(("Poisson regression with exponential "
                              "link is not self-concordant"))

    # TODO: C++ for this
    def _hessian_norm(self, coeffs: np.ndarray, point: np.ndarray) -> float:
        link = self.link
        features, labels = self.features, self.labels
        if link == "identity":
            z1 = features.dot(coeffs)
            z2 = features.dot(point)
            # TODO: beware of zeros in z1 or z2 !
            return np.sqrt((labels * z1 ** 2 / z2 ** 2).mean())
        elif link == "exponential":
            raise NotImplementedError("exp link is not yet implemented")
        else:
            # Fixed message: the accepted value is 'identity', not 'linear'.
            raise ValueError("``link`` must be either 'exponential' or "
                             "'identity'.")

    def _sdca_primal_dual_relation(self, l_l2sq, dual_vector):
        # In order to solve the same problem than other solvers, we need to
        # rescale the penalty parameter if some observations are not
        # considered in SDCA. This is useful for Poisson regression with
        # identity link
        if self.link == "identity":
            scaled_l_l2sq = l_l2sq * self.n_samples / self._sdca_rand_max
        else:
            scaled_l_l2sq = l_l2sq
        primal_vector = np.empty(self.n_coeffs, dtype=self.dtype)
        self._model.sdca_primal_dual_relation(scaled_l_l2sq, dual_vector,
                                              primal_vector)
        return primal_vector

    @property
    def _sdca_rand_max(self):
        # With the identity link, SDCA only samples observations whose label
        # is non-zero, hence the number of such observations is returned.
        if self.link == "identity":
            non_zero_labels = self.labels != 0
            return non_zero_labels.sum().item()
        else:
            raise NotImplementedError()

    def dual_loss(self, dual_coeffs):
        """Computes the dual loss at the given dual coefficients

        Parameters
        ----------
        dual_coeffs : `np.ndarray`
            Dual coefficients

        Returns
        -------
        dual_loss : `float`
            The value of the dual loss
        """
        if self.link != "identity":
            raise NotImplementedError()
        # Only observations with a non-zero label take part in the dual.
        non_zero_labels = self.labels != 0
        dual_loss = self.labels[non_zero_labels] * (
            1 + np.log(dual_coeffs / self.labels[non_zero_labels]))
        dual_loss += np.mean(gammaln(self.labels[non_zero_labels] + 1))
        return np.mean(dual_loss) * self._sdca_rand_max / self.n_samples

    def _build_cpp_model(self, dtype_or_object_with_dtype):
        # Resolve the float32/float64 backend class from dtype_map, then
        # hand it the data and model configuration.
        model_class = self._get_typed_class(dtype_or_object_with_dtype,
                                            dtype_map)
        return model_class(self.features, self.labels, self._link_type,
                           self.fit_intercept, self.n_threads)