from __future__ import division, print_function, absolute_import
import math
import numpy as np
from abc import ABCMeta, abstractmethod
from ...auxiliary.misc import listify
[docs]class FeatureFactory(object):
"""The feature factory.
An instance of a feature can be created by passing
the feature type.
Examples
--------
>>> from mlpy.knowledgerep.cbr.features import FeatureFactory
>>> FeatureFactory.create('float', **{})
"""
@staticmethod
[docs] def create(_type, name, value, **kwargs):
"""Create a feature of the given type.
Parameters
----------
_type: str
The feature type. Valid feature types are:
bool
The feature values are boolean types (:class:`.BoolFeature`).
string
The feature values are of types sting (:class:`.StringFeature`).
int
The feature values are of type integer (:class:`.IntFeature`).
float
The feature values are of type float (:class:`.FloatFeature`).
kwargs : dict, optional
Non-positional arguments to pass to the class of the given type
for initialization.
Returns
-------
Feature :
A feature instance of the given type.
"""
try:
return {
"bool": BoolFeature,
"string": StringFeature,
"int": IntFeature,
"float": FloatFeature,
}[_type](name, value, **kwargs)
except KeyError:
return None
[docs]class Feature(object):
"""The abstract feature class.
A feature consists of one or more feature values.
Parameters
----------
name : str
The name of the feature (this also serves as the features identifying key).
value : bool or string or int or float or list
The feature value.
Other Parameters
----------------
weight : float or list[float]
The weights given to each feature value.
is_index : bool
Flag indicating whether this feature is an index.
retrieval_method : str
The similarity model used for retrieval. Refer to
:attr:`.Feature.retrieval_method` for valid methods.
retrieval_method_params : dict
Parameters relevant to the selected retrieval method.
retrieval_algorithm : str
The internal indexing structure of the training data. Refer
to :attr:`.Feature.retrieval_method` for valid algorithms.
retrieval_metric : str
The metric used to compute the distances between pairs of points.
Refer to :class:`sklearn.neighbors.DistanceMetric` for valid identifiers.
retrieval_metric_params : dict
Parameters relevant to the specified metric.
"""
__metaclass__ = ABCMeta
__slots__ = (
'_name', '_value', '_weight', '_is_index', '_retrieval_method', '_retrieval_method_params',
'_retrieval_algorithm', '_retrieval_metric', '_retrieval_metric_params')
@property
def name(self):
"""The name of the feature (this also serves as the features identifying key).
Returns
-------
str :
The name of the feature.
"""
return self._name
@property
def value(self):
"""The feature value.
Returns
-------
bool or string or int or float :
The feature value.
"""
return self._value
@property
def weight(self):
"""The weights given to each feature value
Returns
-------
float or list[float] :
The feature weights.
"""
return self._weight
@property
def is_index(self):
"""Flag indicating whether this feature is an index.
Returns
-------
bool :
Whether the feature is an index.
"""
return self._is_index
@property
def retrieval_method(self):
"""The similarity model used during retrieval.
Valid models are:
knn
A k-nearest-neighbor algorithm is used to determine similarity
between cases (:class:`NeighborSimilarity`). The value `k` must
be specified.
radius-n
Similarity between cases is determined by the nearest neighbors
within a radius (:class:`NeighborSimilarity`). The radius `n`
must be specified.
kmeans
Similarity is determined by a kmeans clustering algorithm
(:class:`KMeansSimilarity`).
exact-match
Only exact matches are considered similar (:class:`ExactMatchSimilarity`).
cosine
A cosine similarity measure is used to determine similarity between
cases (:class:`CosineSimilarity`).
Returns
-------
str :
The retrieval method.
"""
return self._retrieval_method
@property
def retrieval_method_params(self):
"""Parameters relevant to the specified retrieval method.
Returns
-------
dict :
Retrieval parameters.
"""
return self._retrieval_method_params
@property
def retrieval_algorithm(self):
"""The internal indexing structure of the training data.
The retrieval algorithm is only relevant for :class:`NeighborSimilarity`.
Valid algorithms are:
ball_tree
A ball tree data structure is used for computational efficiency of
the calculation of the distances between pairs of points.
kd_tree
A K-D Tree data structure is used for computational efficiency of
the calculation of the distances between pairs of points.
brute
The nearest neighbors are determined by brute-force computation of
distances between all pairs of points in the dataset.
auto
When ``auto`` is passed, the algorithm attempts to determine the best
approach from the training data.
Returns
-------
str :
The retrieval algorithm.
"""
return self._retrieval_algorithm
@property
def retrieval_metric(self):
"""The metric used to compute the distances between pairs of points.
The retrieval metric is only relevant for :class:`NeighborSimilarity`.
Refer to :class:`sklearn.neighbors.DistanceMetric` for valid metric
identifiers.
Returns
-------
str :
The retrieval metric.
"""
return self._retrieval_metric
@property
def retrieval_metric_params(self):
"""Parameters relevant to the specified metric.
Returns
-------
dict :
The retrieval metric parameters.
"""
return self._retrieval_metric_params
def __init__(self, name, value, **kwargs):
self._name = name
self._value = value
self._weight = kwargs["weight"] if "weight" in kwargs else 1.0
""":ivar: float"""
self._is_index = kwargs["is_index"] if "is_index" in kwargs else True
""":ivar: bool"""
if "retrieval_method" in kwargs:
if kwargs["retrieval_method"] not in ["knn", "radius-n", "kmeans", "exact-match", "cosine"]:
raise ValueError("%s is not a valid retrieval method" % kwargs["retrieval_method"])
self._retrieval_method = kwargs["retrieval_method"]
else:
self._retrieval_method = "knn"
self._retrieval_method_params = kwargs[
"retrieval_method_params"] if "retrieval_method_params" in kwargs else None
if "retrieval_algorithm" in kwargs:
if kwargs["retrieval_algorithm"] not in ["ball_tree", "kd_tree", "brute", "auto"]:
raise ValueError("%s is not a valid retrieval algorithm" % kwargs["retrieval_algorithm"])
self._retrieval_algorithm = kwargs["retrieval_algorithm"]
else:
self._retrieval_algorithm = "kd_tree"
if "retrieval_metric" in kwargs:
if kwargs["retrieval_metric"] not in ["euclidean", "minkowski", "manhattan", "chebyshev", "wminkowski"
"seucliden",
"mahalanobis"]:
raise ValueError("%s is not a valid retrieval metric" % kwargs["retrieval_metric"])
self._retrieval_metric = kwargs["retrieval_metric"]
else:
self._retrieval_metric = "minkowski"
self._retrieval_metric_params = kwargs["retrieval_metric_params"] if "retrieval_metric_params" in kwargs else 2
@abstractmethod
[docs] def compare(self, other):
"""Compare this feature to another feature.
Parameters
----------
other : Feature
The other feature to compare this feature to.
Returns
-------
float :
The similarity metric.
Raises
------
NotImplementedError:
If the child class does not implement this function.
"""
raise NotImplementedError
[docs]class BoolFeature(Feature):
"""The boolean feature.
The boolean feature is represented by a scalar.
Parameters
----------
name : str
The name of the feature (this also serves as the features identifying key).
value : bool or string or int or float or list
The feature value.
Other Parameters
----------------
weight : float or list[float]
The weights given to each feature value.
is_index : bool
Flag indicating whether this feature is an index.
retrieval_method : str
The similarity model used for retrieval. Refer to
:attr:`.Feature.retrieval_method` for valid methods.
retrieval_method_params : dict
Parameters relevant to the selected retrieval method.
retrieval_algorithm : str
The internal indexing structure of the training data. Refer
to :attr:`.Feature.retrieval_method` for valid algorithms.
retrieval_metric : str
The metric used to compute the distances between pairs of points.
Refer to :class:`sklearn.neighbors.DistanceMetric` for valid identifiers.
retrieval_metric_params : dict
Parameters relevant to the specified metric.
Raises
------
ValueError :
If the feature values is not of type `boolean`.
"""
def __init__(self, name, value, **kwargs):
super(BoolFeature, self).__init__(name, value, **kwargs)
if not isinstance(value, bool):
raise ValueError("The feature value is not of type `bool`.")
[docs] def compare(self, other):
"""Compare this feature to another feature.
The strings are compared directly and receive a similarity measure
of `1` if they are the same, `0` otherwise.
Parameters
----------
other : Feature
The other feature to compare this feature to.
Returns
-------
float :
The similarity metric.
"""
if self.value == other.value:
return 1.0
return 0.0
[docs]class StringFeature(Feature):
"""The string feature.
The string feature is represented by a scalar.
Parameters
----------
name : str
The name of the feature (this also serves as the features identifying key).
value : bool or string or int or float or list
The feature value.
Other Parameters
----------------
weight : float or list[float]
The weights given to each feature value.
is_index : bool
Flag indicating whether this feature is an index.
retrieval_method : str
The similarity model used for retrieval. Refer to
:attr:`.Feature.retrieval_method` for valid methods.
retrieval_method_params : dict
Parameters relevant to the selected retrieval method.
retrieval_algorithm : str
The internal indexing structure of the training data. Refer
to :attr:`.Feature.retrieval_method` for valid algorithms.
retrieval_metric : str
The metric used to compute the distances between pairs of points.
Refer to :class:`sklearn.neighbors.DistanceMetric` for valid identifiers.
retrieval_metric_params : dict
Parameters relevant to the specified metric.
Raises
------
ValueError
If the feature values is not of type `string`.
"""
def __init__(self, name, value, **kwargs):
super(StringFeature, self).__init__(name, value, **kwargs)
if not isinstance(value, basestring):
raise ValueError("The feature value is not of type `string`.")
[docs] def compare(self, other):
"""Compare this feature to another feature.
The strings are compared directly and receive a similarity measure
of `1` if they are the same, `0` otherwise.
Parameters
----------
other : Feature
The other feature to compare this feature to.
Returns
-------
float :
The similarity metric.
"""
if self.value == other.value:
return 1.0
return 0.0
[docs]class IntFeature(Feature):
"""The integer feature.
The integer feature is either represented by a scalar
or by a list or values.
Parameters
----------
name : str
The name of the feature (this also serves as the features identifying key).
value : bool or string or int or float or list
The feature value.
Other Parameters
----------------
weight : float or list[float]
The weights given to each feature value.
is_index : bool
Flag indicating whether this feature is an index.
retrieval_method : str
The similarity model used for retrieval. Refer to
:attr:`.Feature.retrieval_method` for valid methods.
retrieval_method_params : dict
Parameters relevant to the selected retrieval method.
retrieval_algorithm : str
The internal indexing structure of the training data. Refer
to :attr:`.Feature.retrieval_method` for valid algorithms.
retrieval_metric : str
The metric used to compute the distances between pairs of points.
Refer to :class:`sklearn.neighbors.DistanceMetric` for valid identifiers.
retrieval_metric_params : dict
Parameters relevant to the specified metric.
Raises
------
ValueError
If not all feature values are of type `integer`.
"""
def __init__(self, name, value, **kwargs):
super(IntFeature, self).__init__(name, value, **kwargs)
if isinstance(value, (np.ndarray, list)):
for v in value:
if not isinstance(v, (int, long)):
raise ValueError("The feature value is not of type `integer`.")
[docs] def compare(self, other):
"""Compare this feature to another feature.
If the feature is represented by a list the similarity
between the two features is determined by the Euclidean
distance of the feature values.
Parameters
----------
other : Feature
The other feature to compare this feature to.
Returns
-------
float :
The similarity metric.
"""
if isinstance(self._value, (int, long)):
return self.value - other.value
assert len(self._value) == len(other.value), "Features don't match"
total_similarity = 0.0
for i, val in enumerate(listify(self._value)):
total_similarity += math.pow(val - other.value[i], 2)
return math.sqrt(total_similarity)
[docs]class FloatFeature(Feature):
"""The float feature.
The float feature is either represented by a scalar
or by a list or values.
Parameters
----------
name : str
The name of the feature (this also serves as the features identifying key).
value : bool or string or int or float or list
The feature value.
Other Parameters
----------------
weight : float or list[float]
The weights given to each feature value.
is_index : bool
Flag indicating whether this feature is an index.
retrieval_method : str
The similarity model used for retrieval. Refer to
:attr:`.Feature.retrieval_method` for valid methods.
retrieval_method_params : dict
Parameters relevant to the selected retrieval method.
retrieval_algorithm : str
The internal indexing structure of the training data. Refer
to :attr:`.Feature.retrieval_method` for valid algorithms.
retrieval_metric : str
The metric used to compute the distances between pairs of points.
Refer to :class:`sklearn.neighbors.DistanceMetric` for valid identifiers.
retrieval_metric_params : dict
Parameters relevant to the specified metric.
Raises
------
ValueError
If not all feature values are of type `float`.
"""
def __init__(self, name, value, **kwargs):
super(FloatFeature, self).__init__(name, value, **kwargs)
if isinstance(value, (np.ndarray, list)):
for v in value:
if not isinstance(v, float):
raise ValueError("The feature value is not of type `float`.")
[docs] def compare(self, other):
"""Compare this feature to another feature.
If the feature is represented by a list the similarity
between the two features is determined by the Euclidean
distance of the feature values.
Parameters
----------
other : Feature
The other feature to compare this feature to.
Returns
-------
float :
The similarity metric.
"""
if isinstance(self._value, float):
return self.value - other.value
assert len(self._value) == len(other.value), "Features don't match"
total_similarity = 0.0
for i, val in enumerate(listify(self._value)):
total_similarity += math.pow(val - other.value[i], 2)
return math.sqrt(total_similarity)