Source code for scorer.scorer.scorer

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import warnings

from scorer.lib.scorer import _scorer

# Module metadata: author name(s) and the matching contact e-mail(s).
__author__  = ['Nico Curti']
__email__   = ['nico.curti2@unibo.it']
# Names exported by a star-import of this module.
__all__ = ['Scorer']


class Scorer (dict):
  '''
  Multi-class score computation.

  This class represents an optimized and extended version of
  the PyCM_ library.
  The full list of scores is evaluated using C++ functions
  wrapped into a single score object.
  The evaluation of the score functions can be performed in a
  parallel environment using OMP multithreading.
  The C++ code is in fact auto-generated using the scripts provided
  in the utils_ directory and the optimal dependency graph is
  computed to allow the work distribution among the available threads.

  The object is a dictionary: after a call to ``evaluate`` the keys are
  the score names and the values are the corresponding score values.

  Example
  -------
  >>> from scorer import Scorer
  >>>
  >>> y_true = ['a', 'b', 'a', 'a', 'b', 'c', 'c', 'a', 'a', 'b', 'c', 'a']
  >>> y_pred = ['b', 'b', 'a', 'c', 'b', 'a', 'c', 'b', 'a', 'b', 'a', 'a']
  >>>
  >>> scorer = Scorer()
  >>> scorer.evaluate(y_true, y_pred)

  References
  ----------
  - Haghighi, S., Jasemi, M., Hessabi, S. and Zolanvari, A. (2018).
    PyCM: Multiclass confusion matrix library in Python.
    Journal of Open Source Software, 3(25), p.729.

  .. _PyCM : https://github.com/sepandhaghighi/pycm
  .. _utils : https://github.com/Nico-Curti/scorer/blob/master/utils/
  '''

  def __init__ (self):
    '''
    Default constructor.

    Creates the wrapped score object and starts from an empty dictionary.
    '''

    self._obj = _scorer()
    super(Scorer, self).__init__({})

  def _check_params (self, true, pred):
    '''
    Check input dimension shapes

    Parameters
    ----------
      true : array-like
        True label array

      pred : array-like
        Predicted label array

    Raises
    ------
      ValueError
        If the two arrays have different lengths.

    Notes
    -----
    .. note::
      The array of true labels and the predicted ones must have
      the same length.
    '''
    if len(true) != len(pred):
      class_name = self.__class__.__name__
      raise ValueError('{0}: wrong shapes found. '
        'Found {1} true labels and {2} predicted labels. '
        'Input arrays must have the same length'.format(
          class_name, len(true), len(pred)))

  def _label2numbers (self, arr):
    '''
    Convert labels to numerical values

    Parameters
    ----------
      arr : array_like
        The array of labels

    Returns
    -------
      numeric_labels : np.ndarray
        Array of int32 labels: each entry is the index of the
        original label inside ``unique``

      unique : np.ndarray
        Sorted array of the distinct labels found in ``arr``

    Raises
    ------
      ValueError
        If fewer than two distinct labels are found.

    Notes
    -----
      .. note::
        The C++ function allows only numerical (integer)
        values as labels in input.
        For more general support refer to the C++ example.

    Examples
    --------
    >>> from scorer import Scorer
    >>> y = ('A', 'A', 'B', 'B')
    >>> num_y, names = Scorer()._label2numbers(y)
    >>> print(num_y)
      [0 0 1 1]
    '''
    unique, numeric_labels = np.unique(arr, return_inverse=True)

    if unique.size <= 1:
      raise ValueError('The number of classes must be greater than 1')

    return numeric_labels.astype('int32'), unique

  def _encode_labels (self, lbl_true, lbl_pred):
    '''
    Encode true and predicted labels with one shared label-to-integer map

    Parameters
    ----------
      lbl_true : array-like
        List of true labels

      lbl_pred : array-like
        List of predicted labels

    Returns
    -------
      y_true : np.ndarray
        int32 codes of the true labels

      y_pred : np.ndarray
        int32 codes of the predicted labels

      names : np.ndarray
        Sorted distinct labels found in the union of the two inputs;
        code ``i`` stands for ``names[i]``

    Notes
    -----
    .. note::
      The two arrays are encoded together: encoding them separately
      would assign different integers to the same label whenever the
      two arrays do not contain exactly the same set of labels.
    '''
    true = np.ravel(lbl_true)
    pred = np.ravel(lbl_pred)

    numeric, names = self._label2numbers(np.concatenate((true, pred)))

    # the first true.size codes belong to the true labels, the rest to the predictions
    return numeric[:true.size], numeric[true.size:], names

  def evaluate (self, lbl_true, lbl_pred):
    '''
    Evaluate scores of prediction labels vs true labels

    Parameters
    ----------
      lbl_true : array-like
        List of true labels

      lbl_pred : array-like
        List of predicted labels

    Returns
    -------
      self

    Raises
    ------
      ValueError
        If the inputs have different lengths or if fewer than two
        distinct labels are found in the union of the two inputs.

    Examples
    --------
    >>> import numpy as np
    >>> from scorer import Scorer
    >>> size = 10
    >>> y_true = np.random.choice([0., 1.], p=[.5, .5], size=(size, ))
    >>> y_pred = np.random.choice([0., 1.], p=[.5, .5], size=(size, ))
    >>>
    >>> scorer = Scorer()
    >>> scorer.evaluate(y_true, y_pred)
    >>>
    >>> # Or using simple lists
    >>>
    >>> y_true = y_true.tolist()
    >>> y_pred = y_pred.tolist()
    >>>
    >>> scorer.evaluate(y_true, y_pred)

    Notes
    -----
    .. note::
      The score evaluation is possible only with integer labels.
      The input labels are encoded as integers (see _encode_labels)
      using a single mapping shared by true and predicted labels.
      After the evaluation the 'Classes' entry stores the original
      label names and 'Confusion Matrix' is reshaped to a square matrix.
    '''

    self._check_params(lbl_true, lbl_pred)

    y_true, y_pred, class_names = self._encode_labels(lbl_true, lbl_pred)

    # set contiguous order memory for c++ compatibility
    y_true = np.ascontiguousarray(y_true)
    y_pred = np.ascontiguousarray(y_pred)

    score = self._obj.evaluate(y_true, y_pred, len(y_true))
    self.update(score)

    # __setitem__ warns on every assignment: silence it for internal updates
    with warnings.catch_warnings():
      warnings.simplefilter("ignore")
      num_classes = len(self['Classes'])
      # shape given positionally: the 'newshape' keyword is deprecated in recent NumPy
      self['Confusion Matrix'] = np.reshape(self['Confusion Matrix'],
                                            (num_classes, num_classes))
      self['Classes'] = class_names

    return self

  @property
  def score (self):
    '''
    Return the score list as dictionary.
    '''
    return self

  @property
  def num_classes (self):
    '''
    Return the number of classes identified.
    If the scores are not yet evaluated the return value is 0.
    '''
    return len(self['Classes']) if 'Classes' in self else 0

  @property
  def _get_available_metrics (self):
    '''
    Get the dictionary of available metrics in a more manageable format.
    '''
    return self._obj.get_available_metrics

  def __getattr__ (self, stat):
    '''
    Access to score stats as attribute

    Parameters
    ----------
      stat: str
        Name of the score

    Raises
    ------
      AttributeError
        If no score matches the required name.

    Examples
    --------
    >>> import numpy as np
    >>> from scorer import Scorer
    >>> size = 10
    >>> y_true = np.random.choice([0., 1.], p=[.5, .5], size=(size, ))
    >>> y_pred = np.random.choice([0., 1.], p=[.5, .5], size=(size, ))
    >>>
    >>> scorer = Scorer()
    >>> scorer.evaluate(y_true, y_pred)
    >>> print(scorer.ACC, scorer.TP, scorer.FP)

    Notes
    -----
    .. note::
      In many cases the string related to the score is very
      long and it includes information about the mathematical
      meaning of that score.
      To facilitate the usage of the class the attribute name is
      compared with a simplified version of each key: the text
      before the first "(" with "%" removed and spaces and "-"
      replaced by "_".
      In this way it is possible to access member values as in
      the following example

      .. code-block:: python

        np.testing.assert_allclose(scorer['ACC(Accuracy)'], scorer.ACC)
        np.testing.assert_allclose(scorer['FP(False positive/type 1 error/false alarm)'], scorer.FP)
        np.testing.assert_allclose(scorer['TOP(Test outcome positive)'], scorer.TOP)
        np.testing.assert_allclose(scorer['FDR(False discovery rate)'], scorer.FDR)

      If the attribute is not found an AttributeError is raised.
    '''

    # These names are needed by the lookup below: if they are missing
    # (e.g. instance created without __init__) searching for them would
    # re-enter this method forever, so fail immediately.
    if stat in ('_obj', '_get_available_metrics'):
      raise AttributeError('Attribute {} not found'.format(stat))

    metrics = self._get_available_metrics

    if stat in metrics:
      return self[metrics[stat]]

    if stat in self:
      return self[stat]

    for key in sorted(self.keys()):
      alias = key.split('(')[0]
      alias = alias.replace('%', '').replace(' ', '_').replace('-', '_')
      if alias == stat:
        return super(Scorer, self).__getitem__(key)

    raise AttributeError('Attribute {} not found'.format(stat))

  def __getitem__ (self, stat):
    '''
    Get the value of the required score

    Parameters
    ----------
      stat: str
        Name of the score

    Raises
    ------
      ValueError
        If the scores are not computed yet (empty dictionary).

      KeyError
        If the name is neither a key of the dictionary nor a name
        known by the available metrics.

    Examples
    --------
    >>> import numpy as np
    >>> from scorer import Scorer
    >>> size = 10
    >>> y_true = np.random.choice([0., 1.], p=[.5, .5], size=(size, ))
    >>> y_pred = np.random.choice([0., 1.], p=[.5, .5], size=(size, ))
    >>>
    >>> scorer = Scorer()
    >>> scorer.evaluate(y_true, y_pred)
    >>>
    >>> print(scorer['ACC(Accuracy)'])

    Notes
    -----
    .. note::
      The search of the score name is performed using the key
      name of the dictionary; if the key is missing the name is
      translated using the available metrics.
      This function is different from __getattr__.
    '''

    if not len(self):
      class_name = self.__class__.__name__
      raise ValueError('{0}: score not computed yet. '
                       'Please use the "evaluate" method before'.format(
                        class_name))

    try:
      return super(Scorer, self).__getitem__(stat)

    except KeyError:
      # not a plain key: try the translation given by the available metrics
      pass

    try:
      full_name = self._get_available_metrics[stat]
      return super(Scorer, self).__getitem__(full_name)

    except KeyError:
      class_name = self.__class__.__name__
      raise KeyError('{0}: statistic not found. '
                     'Available statistics are {1}'.format(
                     class_name, ','.join('{}'.format(k) for k in self.keys())))

  def __setitem__ (self, stat, values):
    '''
    Set a score variable.

    Parameters
    ----------
      stat: str
        Key as name of the new score

      values: float or list
        Value(s) of the new score

    Examples
    --------
    >>> import numpy as np
    >>> from scorer import Scorer
    >>> size = 10
    >>> y_true = np.random.choice([0., 1.], p=[.5, .5], size=(size, ))
    >>> y_pred = np.random.choice([0., 1.], p=[.5, .5], size=(size, ))
    >>>
    >>> scorer = Scorer()
    >>> scorer.evaluate(y_true, y_pred)
    >>>
    >>> scorer['dummy'] = 'dummy'
      UserWarning: Setting new statistics does not enable the computation of the dependencies

    Notes
    -----
    .. note::
      A UserWarning is emitted at every assignment.
    '''
    warnings.warn(UserWarning('Setting new statistics does not enable '
                              'the computation of the dependencies'))
    super(Scorer, self).__setitem__(stat, values)

  def __repr__ (self):
    '''
    Object representation: the string form of the wrapped score object
    '''
    return str(self._obj)

  def __str__ (self):
    '''
    Print the object as table of scores

    The table reports the classes and the confusion matrix, then the
    scores stored as list ("Class Statistics") and finally all the
    other scores ("Overall Statistics").
    Values are formatted with 3 decimals when possible, otherwise
    they are printed as they are.
    '''
    skip = ('Classes', 'Confusion Matrix')
    classes = self['Classes']

    numeric_fmt = ' '.join(['{:>20.3f}'] * len(classes))
    array_fmt   = ' '.join(['{:>20}']    * len(classes))

    class_rows = []
    overall_rows = []

    for name, value in self.items():
      if name in skip:
        continue

      if isinstance(value, list):
        try:
          cell = numeric_fmt.format(*value)
        except (ValueError, TypeError):
          # non-numeric entries (e.g. strings or None): print them as they are
          cell = array_fmt.format(*value)
        class_rows.append('{name:<80} {value}\n'.format(name=name, value=cell))

      else:
        try:
          row = '{name:<80} {value:.3f}\n'.format(name=name, value=value)
        except (ValueError, TypeError):
          row = '{name:<80} {value}\n'.format(name=name, value=value)
        overall_rows.append(row)

    matrix = '\n'.join([''.join(['{:4}'.format(item) for item in row])
                        for row in self['Confusion Matrix']])

    return ''.join([
      'Classes: {}\n'.format(', '.join(['{}'.format(c) for c in classes])),
      'Confusion Matrix:\n',
      matrix,
      '\n\nClass Statistics:\n\n',
      ''.join(class_rows),
      '\nOverall Statistics:\n\n',
      ''.join(overall_rows),
    ])


if __name__ == '__main__':

  # Small three-class demo: integer labels given as plain lists
  # (int32 NumPy arrays with the same values can be used as well).
  truth = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2]
  guess = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2]

  # evaluate returns the scorer itself, so the table can be printed directly
  demo = Scorer().evaluate(truth, guess)

  print(demo)