# Source code for mlpy.knowledgerep.cbr.features

from __future__ import division, print_function, absolute_import

import math
import numbers
from abc import ABCMeta, abstractmethod

import numpy as np

from ...auxiliary.misc import listify


class FeatureFactory(object):
    """The feature factory.

    An instance of a feature can be created by passing the feature type
    together with the feature's name and value.

    Examples
    --------
    >>> from mlpy.knowledgerep.cbr.features import FeatureFactory
    >>> feature = FeatureFactory.create('float', 'speed', 1.5)

    """

    @staticmethod
    def create(_type, name, value, **kwargs):
        """Create a feature of the given type.

        Parameters
        ----------
        _type : str
            The feature type. Valid feature types are:

                bool
                    The feature values are boolean types
                    (:class:`.BoolFeature`).

                string
                    The feature values are of type string
                    (:class:`.StringFeature`).

                int
                    The feature values are of type integer
                    (:class:`.IntFeature`).

                float
                    The feature values are of type float
                    (:class:`.FloatFeature`).

        name : str
            The name of the feature, passed through to the feature class.
        value : bool or string or int or float or list
            The feature value, passed through to the feature class.
        kwargs : dict, optional
            Non-positional arguments to pass to the class of the given
            type for initialization.

        Returns
        -------
        Feature or None :
            A feature instance of the given type, or ``None`` if `_type`
            is not one of the valid feature types.

        """
        feature_class = {
            "bool": BoolFeature,
            "string": StringFeature,
            "int": IntFeature,
            "float": FloatFeature,
        }.get(_type)

        if feature_class is None:
            return None

        # The constructor is deliberately called outside of any
        # ``KeyError`` handling so that an error raised while building
        # the feature is not mistaken for an unknown feature type.
        return feature_class(name, value, **kwargs)
class Feature(object):
    """The abstract feature class.

    A feature consists of one or more feature values.

    Parameters
    ----------
    name : str
        The name of the feature (this also serves as the features
        identifying key).
    value : bool or string or int or float or list
        The feature value.

    Other Parameters
    ----------------
    weight : float or list[float]
        The weights given to each feature value. Defaults to ``1.0``.
    is_index : bool
        Flag indicating whether this feature is an index. Defaults to
        ``True``.
    retrieval_method : str
        The similarity model used for retrieval. Refer to
        :attr:`.Feature.retrieval_method` for valid methods. Defaults to
        ``"knn"``.
    retrieval_method_params : dict
        Parameters relevant to the selected retrieval method. Defaults to
        ``None``.
    retrieval_algorithm : str
        The internal indexing structure of the training data. Refer to
        :attr:`.Feature.retrieval_algorithm` for valid algorithms.
        Defaults to ``"kd_tree"``.
    retrieval_metric : str
        The metric used to compute the distances between pairs of points.
        Refer to :class:`sklearn.neighbors.DistanceMetric` for valid
        identifiers. Defaults to ``"minkowski"``.
    retrieval_metric_params : dict
        Parameters relevant to the specified metric. Defaults to ``2``
        (presumably the Minkowski power parameter -- TODO confirm).

    Raises
    ------
    ValueError
        If `retrieval_method`, `retrieval_algorithm` or
        `retrieval_metric` is given but is not one of the accepted
        identifiers.

    """
    # Python 2 metaclass hook. Python 3 ignores this attribute, so the
    # abstract-method check is only enforced when running on Python 2.
    __metaclass__ = ABCMeta

    __slots__ = (
        '_name', '_value', '_weight', '_is_index',
        '_retrieval_method', '_retrieval_method_params',
        '_retrieval_algorithm', '_retrieval_metric',
        '_retrieval_metric_params',
    )

    # Accepted identifiers for the keyword arguments validated in __init__.
    _RETRIEVAL_METHODS = ("knn", "radius-n", "kmeans", "exact-match", "cosine")
    _RETRIEVAL_ALGORITHMS = ("ball_tree", "kd_tree", "brute", "auto")
    _RETRIEVAL_METRICS = ("euclidean", "minkowski", "manhattan", "chebyshev",
                          "wminkowski", "seuclidean", "mahalanobis")

    @property
    def name(self):
        """The name of the feature (this also serves as the features
        identifying key).

        Returns
        -------
        str :
            The name of the feature.

        """
        return self._name

    @property
    def value(self):
        """The feature value.

        Returns
        -------
        bool or string or int or float :
            The feature value.

        """
        return self._value

    @property
    def weight(self):
        """The weights given to each feature value.

        Returns
        -------
        float or list[float] :
            The feature weights.

        """
        return self._weight

    @property
    def is_index(self):
        """Flag indicating whether this feature is an index.

        Returns
        -------
        bool :
            Whether the feature is an index.

        """
        return self._is_index

    @property
    def retrieval_method(self):
        """The similarity model used during retrieval.

        Valid models are:

            knn
                A k-nearest-neighbor algorithm is used to determine
                similarity between cases (:class:`NeighborSimilarity`).
                The value `k` must be specified.

            radius-n
                Similarity between cases is determined by the nearest
                neighbors within a radius (:class:`NeighborSimilarity`).
                The radius `n` must be specified.

            kmeans
                Similarity is determined by a kmeans clustering algorithm
                (:class:`KMeansSimilarity`).

            exact-match
                Only exact matches are considered similar
                (:class:`ExactMatchSimilarity`).

            cosine
                A cosine similarity measure is used to determine
                similarity between cases (:class:`CosineSimilarity`).

        Returns
        -------
        str :
            The retrieval method.

        """
        return self._retrieval_method

    @property
    def retrieval_method_params(self):
        """Parameters relevant to the specified retrieval method.

        Returns
        -------
        dict :
            Retrieval parameters.

        """
        return self._retrieval_method_params

    @property
    def retrieval_algorithm(self):
        """The internal indexing structure of the training data.

        The retrieval algorithm is only relevant for
        :class:`NeighborSimilarity`. Valid algorithms are:

            ball_tree
                A ball tree data structure is used for computational
                efficiency of the calculation of the distances between
                pairs of points.

            kd_tree
                A K-D Tree data structure is used for computational
                efficiency of the calculation of the distances between
                pairs of points.

            brute
                The nearest neighbors are determined by brute-force
                computation of distances between all pairs of points in
                the dataset.

            auto
                When ``auto`` is passed, the algorithm attempts to
                determine the best approach from the training data.

        Returns
        -------
        str :
            The retrieval algorithm.

        """
        return self._retrieval_algorithm

    @property
    def retrieval_metric(self):
        """The metric used to compute the distances between pairs of points.

        The retrieval metric is only relevant for
        :class:`NeighborSimilarity`. Refer to
        :class:`sklearn.neighbors.DistanceMetric` for valid metric
        identifiers.

        Returns
        -------
        str :
            The retrieval metric.

        """
        return self._retrieval_metric

    @property
    def retrieval_metric_params(self):
        """Parameters relevant to the specified metric.

        Returns
        -------
        dict :
            The retrieval metric parameters.

        """
        return self._retrieval_metric_params

    def __init__(self, name, value, **kwargs):
        self._name = name
        self._value = value

        # float or list[float]
        self._weight = kwargs.get("weight", 1.0)
        # bool
        self._is_index = kwargs.get("is_index", True)

        # Each identifier is only validated when the caller supplies it;
        # the defaults are known to be valid.
        self._retrieval_method = self._checked_choice(
            kwargs, "retrieval_method", self._RETRIEVAL_METHODS, "knn",
            "%s is not a valid retrieval method")
        self._retrieval_method_params = kwargs.get("retrieval_method_params")

        self._retrieval_algorithm = self._checked_choice(
            kwargs, "retrieval_algorithm", self._RETRIEVAL_ALGORITHMS,
            "kd_tree", "%s is not a valid retrieval algorithm")

        self._retrieval_metric = self._checked_choice(
            kwargs, "retrieval_metric", self._RETRIEVAL_METRICS, "minkowski",
            "%s is not a valid retrieval metric")
        self._retrieval_metric_params = kwargs.get("retrieval_metric_params", 2)

    @staticmethod
    def _checked_choice(kwargs, key, valid, default, message):
        """Return ``kwargs[key]`` if it is in `valid`, or `default` if absent.

        Raises
        ------
        ValueError
            If `key` is present in `kwargs` but its value is not in `valid`.
            The error text is `message` formatted with the offending value.

        """
        if key not in kwargs:
            return default
        choice = kwargs[key]
        if choice not in valid:
            raise ValueError(message % choice)
        return choice

    @abstractmethod
    def compare(self, other):
        """Compare this feature to another feature.

        Parameters
        ----------
        other : Feature
            The other feature to compare this feature to.

        Returns
        -------
        float :
            The similarity metric.

        Raises
        ------
        NotImplementedError
            If the child class does not implement this function.

        """
        raise NotImplementedError
class BoolFeature(Feature):
    """The boolean feature.

    The boolean feature is represented by a scalar.

    Parameters
    ----------
    name : str
        The name of the feature (this also serves as the features
        identifying key).
    value : bool
        The feature value.

    Other Parameters
    ----------------
    weight : float or list[float]
        The weights given to each feature value.
    is_index : bool
        Flag indicating whether this feature is an index.
    retrieval_method : str
        The similarity model used for retrieval. Refer to
        :attr:`.Feature.retrieval_method` for valid methods.
    retrieval_method_params : dict
        Parameters relevant to the selected retrieval method.
    retrieval_algorithm : str
        The internal indexing structure of the training data. Refer to
        :attr:`.Feature.retrieval_algorithm` for valid algorithms.
    retrieval_metric : str
        The metric used to compute the distances between pairs of points.
        Refer to :class:`sklearn.neighbors.DistanceMetric` for valid
        identifiers.
    retrieval_metric_params : dict
        Parameters relevant to the specified metric.

    Raises
    ------
    ValueError
        If the feature value is not of type `bool`.

    """

    def __init__(self, name, value, **kwargs):
        # The base initializer runs first, so its own keyword validation
        # takes precedence over the value type check below.
        super(BoolFeature, self).__init__(name, value, **kwargs)

        is_boolean = isinstance(value, bool)
        if not is_boolean:
            raise ValueError("The feature value is not of type `bool`.")

    def compare(self, other):
        """Compare this feature to another feature.

        The two values are compared for equality and receive a similarity
        measure of `1` if they are the same, `0` otherwise.

        Parameters
        ----------
        other : Feature
            The other feature to compare this feature to.

        Returns
        -------
        float :
            The similarity metric.

        """
        return 1.0 if self.value == other.value else 0.0
class StringFeature(Feature):
    """The string feature.

    The string feature is represented by a scalar.

    Parameters
    ----------
    name : str
        The name of the feature (this also serves as the features
        identifying key).
    value : string
        The feature value.

    Other Parameters
    ----------------
    weight : float or list[float]
        The weights given to each feature value.
    is_index : bool
        Flag indicating whether this feature is an index.
    retrieval_method : str
        The similarity model used for retrieval. Refer to
        :attr:`.Feature.retrieval_method` for valid methods.
    retrieval_method_params : dict
        Parameters relevant to the selected retrieval method.
    retrieval_algorithm : str
        The internal indexing structure of the training data. Refer to
        :attr:`.Feature.retrieval_algorithm` for valid algorithms.
    retrieval_metric : str
        The metric used to compute the distances between pairs of points.
        Refer to :class:`sklearn.neighbors.DistanceMetric` for valid
        identifiers.
    retrieval_metric_params : dict
        Parameters relevant to the specified metric.

    Raises
    ------
    ValueError
        If the feature value is not of type `string`.

    """

    # ``basestring`` exists only on Python 2 (where it covers both ``str``
    # and ``unicode``); fall back to ``str`` so the type check also works
    # on Python 3 instead of failing with a NameError.
    try:
        _STRING_TYPES = basestring  # noqa: F821
    except NameError:
        _STRING_TYPES = str

    def __init__(self, name, value, **kwargs):
        super(StringFeature, self).__init__(name, value, **kwargs)

        if not isinstance(value, self._STRING_TYPES):
            raise ValueError("The feature value is not of type `string`.")

    def compare(self, other):
        """Compare this feature to another feature.

        The strings are compared directly and receive a similarity
        measure of `1` if they are the same, `0` otherwise.

        Parameters
        ----------
        other : Feature
            The other feature to compare this feature to.

        Returns
        -------
        float :
            The similarity metric.

        """
        if self.value == other.value:
            return 1.0
        return 0.0
class IntFeature(Feature):
    """The integer feature.

    The integer feature is either represented by a scalar or by a list
    of values.

    Parameters
    ----------
    name : str
        The name of the feature (this also serves as the features
        identifying key).
    value : int or list[int] or numpy.ndarray
        The feature value.

    Other Parameters
    ----------------
    weight : float or list[float]
        The weights given to each feature value.
    is_index : bool
        Flag indicating whether this feature is an index.
    retrieval_method : str
        The similarity model used for retrieval. Refer to
        :attr:`.Feature.retrieval_method` for valid methods.
    retrieval_method_params : dict
        Parameters relevant to the selected retrieval method.
    retrieval_algorithm : str
        The internal indexing structure of the training data. Refer to
        :attr:`.Feature.retrieval_algorithm` for valid algorithms.
    retrieval_metric : str
        The metric used to compute the distances between pairs of points.
        Refer to :class:`sklearn.neighbors.DistanceMetric` for valid
        identifiers.
    retrieval_metric_params : dict
        Parameters relevant to the specified metric.

    Raises
    ------
    ValueError
        If `value` is a list or array and not all of its elements are
        integers. Scalar values are not validated.

    """

    def __init__(self, name, value, **kwargs):
        super(IntFeature, self).__init__(name, value, **kwargs)

        if isinstance(value, (np.ndarray, list)):
            # ``numbers.Integral`` covers ``int``, Python 2 ``long`` and
            # NumPy integer scalars, so the check works on both Python 2
            # and Python 3 and for integer arrays.
            if not all(isinstance(v, numbers.Integral) for v in value):
                raise ValueError("The feature value is not of type `integer`.")

    def compare(self, other):
        """Compare this feature to another feature.

        If the feature is a scalar, the signed difference
        ``self.value - other.value`` is returned. If the feature is
        represented by a list, the Euclidean distance between the two
        lists of feature values is returned.

        Parameters
        ----------
        other : Feature
            The other feature to compare this feature to.

        Returns
        -------
        float :
            The similarity metric.

        Raises
        ------
        AssertionError
            If the features are lists of different lengths.

        """
        # NOTE(review): the scalar branch yields a signed difference while
        # the list branch yields a non-negative distance -- confirm that
        # callers expect this asymmetry.
        if isinstance(self._value, numbers.Integral):
            return self.value - other.value

        # Raised explicitly (rather than via ``assert``) so the check is
        # not stripped when Python runs with ``-O``; the exception type is
        # kept for callers that already catch it.
        if len(self._value) != len(other.value):
            raise AssertionError("Features don't match")

        squared_differences = (
            math.pow(mine - theirs, 2)
            for mine, theirs in zip(listify(self._value), other.value)
        )
        return math.sqrt(sum(squared_differences, 0.0))
class FloatFeature(Feature):
    """The float feature.

    The float feature is either represented by a scalar or by a list of
    values.

    Parameters
    ----------
    name : str
        The name of the feature (this also serves as the features
        identifying key).
    value : float or list[float] or numpy.ndarray
        The feature value.

    Other Parameters
    ----------------
    weight : float or list[float]
        The weights given to each feature value.
    is_index : bool
        Flag indicating whether this feature is an index.
    retrieval_method : str
        The similarity model used for retrieval. Refer to
        :attr:`.Feature.retrieval_method` for valid methods.
    retrieval_method_params : dict
        Parameters relevant to the selected retrieval method.
    retrieval_algorithm : str
        The internal indexing structure of the training data. Refer to
        :attr:`.Feature.retrieval_algorithm` for valid algorithms.
    retrieval_metric : str
        The metric used to compute the distances between pairs of points.
        Refer to :class:`sklearn.neighbors.DistanceMetric` for valid
        identifiers.
    retrieval_metric_params : dict
        Parameters relevant to the specified metric.

    Raises
    ------
    ValueError
        If `value` is a list or array and not all of its elements are
        floats. Scalar values are not validated.

    """

    def __init__(self, name, value, **kwargs):
        super(FloatFeature, self).__init__(name, value, **kwargs)

        if isinstance(value, (np.ndarray, list)):
            # ``np.floating`` is accepted alongside ``float`` so that
            # arrays of any NumPy float width pass, not only those whose
            # scalar type happens to subclass the builtin ``float``.
            if not all(isinstance(v, (float, np.floating)) for v in value):
                raise ValueError("The feature value is not of type `float`.")

    def compare(self, other):
        """Compare this feature to another feature.

        If the feature is a scalar, the signed difference
        ``self.value - other.value`` is returned. If the feature is
        represented by a list, the Euclidean distance between the two
        lists of feature values is returned.

        Parameters
        ----------
        other : Feature
            The other feature to compare this feature to.

        Returns
        -------
        float :
            The similarity metric.

        Raises
        ------
        AssertionError
            If the features are lists of different lengths.

        """
        # Scalars are not validated at construction time, so any real
        # number (e.g. an ``int`` or a NumPy float scalar) may be stored
        # here; treating all of them as scalars avoids calling ``len()``
        # on a number.
        # NOTE(review): the scalar branch yields a signed difference while
        # the list branch yields a non-negative distance -- confirm that
        # callers expect this asymmetry.
        if isinstance(self._value, numbers.Real):
            return self.value - other.value

        # Raised explicitly (rather than via ``assert``) so the check is
        # not stripped when Python runs with ``-O``; the exception type is
        # kept for callers that already catch it.
        if len(self._value) != len(other.value):
            raise AssertionError("Features don't match")

        squared_differences = (
            math.pow(mine - theirs, 2)
            for mine, theirs in zip(listify(self._value), other.value)
        )
        return math.sqrt(sum(squared_differences, 0.0))