Source code for mlpy.knowledgerep.cbr.similarity

from __future__ import division, print_function, absolute_import

import math
import numpy as np

from abc import ABCMeta, abstractmethod

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors.dist_metrics import METRIC_MAPPING


class Stat(object):
    """The similarity statistics container.

    The similarity statistics object is a container used to pass the
    calculated measure of similarity between the case identified by the
    case id and the query case from one function to another.

    Parameters
    ----------
    case_id : int
        The case's id.
    similarity : float
        The similarity measure.

    """
    __slots__ = ('_case_id', '_similarity')

    @property
    def case_id(self):
        """The case's id.

        Returns
        -------
        int :
            The case's id.

        """
        return self._case_id

    @property
    def similarity(self):
        """The similarity measure.

        Returns
        -------
        float :
            The similarity measure.

        """
        return self._similarity

    def __init__(self, case_id, similarity=None):
        self._case_id = case_id
        self._similarity = similarity
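
# Minimal usage sketch (not part of the original module): a ``Stat`` simply
# pairs a case id with an optional similarity value. The numbers below are
# made-up illustration values.
#
#   >>> stat = Stat(42, 0.75)
#   >>> stat.case_id, stat.similarity
#   (42, 0.75)
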
class SimilarityFactory(object):
    """The similarity factory.

    An instance of a similarity model can be created by passing
    the similarity model type.

    Examples
    --------
    >>> from mlpy.knowledgerep.cbr.similarity import SimilarityFactory
    >>> SimilarityFactory.create('knn', **{'method_params': 3})

    """

    @staticmethod
    def create(_type, **kwargs):
        """Create a similarity model of the given type.

        Parameters
        ----------
        _type : str
            The similarity model type. Valid types are:

            knn
                A k-nearest-neighbor algorithm is used to determine similarity
                between cases (:class:`NeighborSimilarity`). The value
                ``n_neighbors`` must be specified.
            radius-n
                Similarity between cases is determined by the nearest neighbors
                within a radius (:class:`NeighborSimilarity`). The value
                ``radius`` must be specified.
            kmeans
                Similarity is determined by a KMeans clustering algorithm
                (:class:`KMeansSimilarity`). The value ``n_cluster`` must be
                specified.
            exact-match
                Only exact matches are considered similar
                (:class:`ExactMatchSimilarity`).
            cosine
                A cosine similarity measure is used to determine similarity
                between cases (:class:`CosineSimilarity`).

        kwargs : dict, optional
            Non-positional arguments to pass to the class of the given type
            for initialization.

        Returns
        -------
        ISimilarity :
            A similarity instance of the given type.

        """
        try:
            if _type == "knn":
                kwargs["n_neighbors"] = kwargs.pop("method_params")
            elif _type == "radius-n":
                kwargs["radius"] = kwargs.pop("method_params")
            elif _type == "kmeans":
                kwargs["n_cluster"] = kwargs.pop("method_params")
            elif _type == "cosine":
                kwargs["threshold"] = kwargs.pop("method_params")
            else:
                kwargs.pop("method_params", None)

            return {
                "knn": NeighborSimilarity,
                "radius-n": NeighborSimilarity,
                "kmeans": KMeansSimilarity,
                "exact-match": ExactMatchSimilarity,
                "cosine": CosineSimilarity,
            }[_type](**kwargs)
        except KeyError:
            return None
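
# Hedged usage sketch (not in the original source): the factory routes the
# generic ``method_params`` value to the parameter expected by the chosen
# model; the parameter values used here are arbitrary examples.
#
#   >>> knn_sim = SimilarityFactory.create('knn', **{'method_params': 5})
#   >>> isinstance(knn_sim, NeighborSimilarity)
#   True
#   >>> SimilarityFactory.create('unknown-type') is None
#   True
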
class ISimilarity(object):
    """The similarity model interface.

    The similarity model keeps an internal indexing structure of the
    relevant case data to efficiently compute the similarity measure
    between data points.

    Notes
    -----
    All similarity models must inherit from this class.

    """
    __metaclass__ = ABCMeta

    def __init__(self):
        #: The indexing structure
        self._indexing_structure = None

        #: The mapping of the data points to their case ids
        self._id_map = None
        """:ivar: dict"""

    @abstractmethod
    def build_indexing_structure(self, data, id_map):
        """Build the indexing structure.

        Parameters
        ----------
        data : ndarray[ndarray[float]]
            The raw data points to be indexed.
        id_map : dict[int, int]
            The mapping from the data points to their case ids.

        Raises
        ------
        NotImplementedError
            If the child class does not implement this function.

        """
        raise NotImplementedError

    @abstractmethod
    def compute_similarity(self, data_point):
        """Computes the similarity.

        Computes the similarity between the data point and the data in the
        indexing structure, returning the results in a collection of
        similarity statistics (:class:`Stat`).

        Parameters
        ----------
        data_point : list[float]
            The raw data point to compare against the data points stored in
            the indexing structure.

        Returns
        -------
        list[Stat] :
            A collection of similarity statistics.

        Raises
        ------
        NotImplementedError
            If the child class does not implement this function.

        """
        raise NotImplementedError
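
# Hedged sketch (not part of the original module): a custom similarity model
# only needs to implement the two abstract methods of ``ISimilarity``. The
# ``IdentitySimilarity`` name and its trivial behavior are hypothetical.
#
#   >>> class IdentitySimilarity(ISimilarity):
#   ...     def build_indexing_structure(self, data, id_map):
#   ...         self._id_map = id_map
#   ...         self._indexing_structure = data
#   ...     def compute_similarity(self, data_point):
#   ...         return [Stat(self._id_map[i]) for i, _ in enumerate(self._indexing_structure)]
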
class NeighborSimilarity(ISimilarity):
    """The neighborhood similarity model.

    The neighbor similarity model determines similarity between the data
    in the indexing structure and the query data by using the nearest
    neighbor algorithm :class:`sklearn.neighbors.NearestNeighbors`.

    Both a k-neighbors classifier and a radius-neighbor-classifier are
    implemented. To choose between the classifiers exactly one of
    `n_neighbors` or `radius` must be specified.

    Parameters
    ----------
    n_neighbors : int
        The number of data points considered to be closest neighbors.
    radius : float
        The radius around the query data point, within which the data
        points are considered closest neighbors.
    algorithm : str
        The internal indexing structure of the training data. Defaults
        to `kd_tree`.
    metric : str
        The metric used to compute the distances between pairs of points.
        Refer to :class:`sklearn.neighbors.DistanceMetric` for valid
        identifiers. Default is `euclidean`.
    metric_params : dict
        Parameters relevant to the specified metric.

    Raises
    ------
    UserWarning
        If either both or none of `n_neighbors` and `radius` are given.

    See Also
    --------
    :class:`sklearn.neighbors.KNeighborsClassifier`,
    :class:`sklearn.neighbors.RadiusNeighborsClassifier`

    """

    def __init__(self, n_neighbors=None, radius=None, algorithm=None, metric=None, metric_params=None):
        super(NeighborSimilarity, self).__init__()

        # exactly one of `n_neighbors` and `radius` selects the classifier type
        if (n_neighbors is None) == (radius is None):
            raise UserWarning("Exactly one of n_neighbors or radius must be initialized.")

        self._n_neighbors = n_neighbors
        self._radius = radius

        if algorithm is not None:
            if algorithm not in ["ball_tree", "kd_tree", "brute", "auto"]:
                raise ValueError("%s is not a valid retrieval algorithm" % algorithm)
            self._algorithm = algorithm
        else:
            self._algorithm = "kd_tree"

        if metric is not None:
            if metric not in METRIC_MAPPING:
                raise ValueError("%s is not a valid retrieval metric" % metric)
            self._metric = metric
        else:
            self._metric = "euclidean"

        self._metric_params = metric_params if metric_params is not None else 2
    def build_indexing_structure(self, data, id_map):
        """Build the indexing structure.

        Build the indexing structure by fitting the data according to the
        specified algorithm.

        Parameters
        ----------
        data : ndarray[ndarray[float]]
            The raw data points to be indexed.
        id_map : dict[int, int]
            The mapping from the data points to their case ids.

        """
        self._id_map = id_map

        if self._n_neighbors is not None:
            self._indexing_structure = NearestNeighbors(n_neighbors=self._n_neighbors, algorithm=self._algorithm,
                                                        metric=self._metric, p=self._metric_params).fit(data)
        else:
            self._indexing_structure = NearestNeighbors(radius=self._radius, algorithm=self._algorithm,
                                                        metric=self._metric, p=self._metric_params).fit(data)

    def compute_similarity(self, data_point):
        """Computes the similarity.

        Computes the similarity between the data point and the data in
        the indexing structure using the :class:`sklearn.neighbors.NearestNeighbors`
        algorithm. The results are returned in a collection of similarity
        statistics (:class:`Stat`).

        Parameters
        ----------
        data_point : list[float]
            The raw data point to compare against the data points stored in
            the indexing structure.

        Returns
        -------
        list[Stat] :
            A collection of similarity statistics.

        """
        if self._n_neighbors is not None:
            # noinspection PyProtectedMember
            raw_data = self._indexing_structure._fit_X
            if len(raw_data) < self._n_neighbors:
                result = []
                for i, feat in enumerate(raw_data):
                    dist = np.linalg.norm(np.asarray(data_point) - np.asarray(feat))
                    result.append(Stat(self._id_map[i], dist))

                # noinspection PyShadowingNames
                result = sorted(result, key=lambda x: x.similarity)
            else:
                d, key_lists = self._indexing_structure.kneighbors(data_point)
                result = [Stat(self._id_map[x], d[0][i]) for i, x in enumerate(key_lists[0])]
        else:
            d, key_lists = self._indexing_structure.radius_neighbors(data_point)
            result = [Stat(self._id_map[x], d[0][i]) for i, x in enumerate(key_lists[0])]

        return result
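
# Hedged usage sketch (not in the original source): build a k-nearest-neighbor
# index over three two-dimensional cases and query it. The feature values, id
# mapping, and query point are made up; the query is passed as a 2-D array for
# compatibility with newer scikit-learn versions, although the docstring
# documents a flat list.
#
#   >>> sim = NeighborSimilarity(n_neighbors=2)
#   >>> sim.build_indexing_structure(np.array([[0., 0.], [1., 0.], [0., 2.]]),
#   ...                              {0: 100, 1: 101, 2: 102})
#   >>> stats = sim.compute_similarity([[0.1, 0.0]])
#   >>> [s.case_id for s in stats]
#   [100, 101]
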
class KMeansSimilarity(ISimilarity):
    """The KMeans similarity model.

    The KMeans similarity model determines similarity between the data in
    the indexing structure and the query data by using the
    :class:`sklearn.cluster.KMeans` algorithm.

    Parameters
    ----------
    n_cluster : int
        The number of clusters to fit the raw data in.

    """

    def __init__(self, n_cluster=None):
        super(KMeansSimilarity, self).__init__()
        self._n_cluster = n_cluster if n_cluster is not None else 8
    def build_indexing_structure(self, data, id_map):
        """Build the indexing structure.

        Build the indexing structure by fitting the data into `n_cluster`
        clusters.

        Parameters
        ----------
        data : ndarray[ndarray[float]]
            The raw data points to be indexed.
        id_map : dict[int, int]
            The mapping from the data points to their case ids.

        """
        self._id_map = id_map
        self._indexing_structure = KMeans(init='k-means++', n_clusters=self._n_cluster, n_init=10).fit(data)

    def compute_similarity(self, data_point):
        """Computes the similarity.

        Computes the similarity between the data point and the data in the
        indexing structure using the :class:`sklearn.cluster.KMeans` clustering
        algorithm. The results are returned in a collection of similarity
        statistics (:class:`Stat`).

        Parameters
        ----------
        data_point : list[float]
            The raw data point to compare against the data points stored in
            the indexing structure.

        Returns
        -------
        list[Stat] :
            A collection of similarity statistics.

        """
        label = self._indexing_structure.predict(data_point)

        result = []
        try:
            # noinspection PyTypeChecker,PyUnresolvedReferences
            key_lists = np.nonzero(self._indexing_structure.labels_ == label[0])[0]
            result = [Stat(self._id_map[x]) for x in key_lists]
        except IndexError:
            pass

        return result
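
# Hedged usage sketch (not in the original source): cluster four cases into
# two groups and retrieve the cases that share the query's cluster. The data
# and id mapping are illustrative; cluster assignments depend on the KMeans
# initialization, although this well-separated data splits reliably.
#
#   >>> sim = KMeansSimilarity(n_cluster=2)
#   >>> sim.build_indexing_structure(np.array([[0.], [0.1], [5.], [5.1]]),
#   ...                              {0: 1, 1: 2, 2: 3, 3: 4})
#   >>> stats = sim.compute_similarity([[0.05]])
#   >>> sorted(s.case_id for s in stats)
#   [1, 2]
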
class ExactMatchSimilarity(ISimilarity):
    """The exact match similarity model.

    The exact match similarity model considers only exact matches between
    the data in the indexing structure and the query data as similar.

    """
    # noinspection PyUnusedLocal
    def __init__(self, **kwargs):
        super(ExactMatchSimilarity, self).__init__()
    def build_indexing_structure(self, data, id_map):
        """Build the indexing structure.

        To determine exact matches a brute-force algorithm is used, thus the
        data remains as is and no special indexing structure is implemented.

        Parameters
        ----------
        data : ndarray[ndarray[float]]
            The raw data points to be indexed.
        id_map : dict[int, int]
            The mapping from the data points to their case ids.

        .. todo::
            It might be worth looking into a more efficient way of determining
            exact matches.

        """
        self._id_map = id_map
        self._indexing_structure = data

    def compute_similarity(self, data_point):
        """Computes the similarity.

        Computes the similarity between the data point and the data in the
        indexing structure identifying exact matches. The results are returned
        in a collection of similarity statistics (:class:`Stat`).

        Parameters
        ----------
        data_point : list[float]
            The raw data point to compare against the data points stored in
            the indexing structure.

        Returns
        -------
        list[Stat] :
            A collection of similarity statistics.

        """
        result = []
        for i, feat in enumerate(self._indexing_structure):
            total = 0
            for j, val in enumerate(data_point):
                total += math.pow(val - feat[j], 2)

            if total == 0.0:
                result.append(Stat(self._id_map[i]))

        return result
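
# Hedged usage sketch (not in the original source): only a query identical to
# a stored case is reported as similar. Feature values and the id mapping are
# made up for demonstration.
#
#   >>> sim = ExactMatchSimilarity()
#   >>> sim.build_indexing_structure([[1.0, 2.0], [3.0, 4.0]], {0: 7, 1: 8})
#   >>> [s.case_id for s in sim.compute_similarity([3.0, 4.0])]
#   [8]
#   >>> sim.compute_similarity([3.0, 4.5])
#   []
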
class CosineSimilarity(ISimilarity):
    """The cosine similarity model.

    Cosine similarity is a measure of similarity between two vectors of an
    inner product space that measures the cosine of the angle between them.
    The cosine of 0 degrees is 1, and it is less than 1 for any other angle.
    It is thus a judgement of orientation and not magnitude: two vectors with
    the same orientation have a cosine similarity of 1, two vectors at 90
    degrees have a similarity of 0, and two vectors diametrically opposed
    have a similarity of -1, independent of their magnitude [1]_.

    The cosine model employs the
    `cosine_similarity <http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity>`_
    function from the :mod:`sklearn.metrics.pairwise` module to determine
    similarity.

    .. seealso::
        `Machine Learning::Cosine Similarity for Vector Space Models (Part III)
        <http://blog.christianperone.com/?p=2497>`_

    References
    ----------
    .. [1] `Wikipedia::cosine_similarity <https://en.wikipedia.org/wiki/Cosine_similarity>`_

    """
    # noinspection PyUnusedLocal
    def __init__(self, **kwargs):
        super(CosineSimilarity, self).__init__()
    def build_indexing_structure(self, data, id_map):
        """Build the indexing structure.

        The cosine_similarity function from :mod:`sklearn.metrics.pairwise`
        takes the raw data as input. Thus the data remains as is and no
        special indexing structure is implemented.

        Parameters
        ----------
        data : ndarray[ndarray[float]]
            The raw data points to be indexed.
        id_map : dict[int, int]
            The mapping from the data points to their case ids.

        """
        self._id_map = id_map
        self._indexing_structure = data

    def compute_similarity(self, data_point):
        """Computes the similarity.

        Computes the similarity between the data point and the data in the
        indexing structure using the function :func:`cosine_similarity` from
        :mod:`sklearn.metrics.pairwise`.

        The resulting similarity ranges from -1 meaning exactly opposite, to 1
        meaning exactly the same, with 0 indicating orthogonality
        (decorrelation), and in-between values indicating intermediate
        similarity or dissimilarity.

        The results are returned in a collection of similarity statistics
        (:class:`Stat`).

        Parameters
        ----------
        data_point : list[float]
            The raw data point to compare against the data points stored in
            the indexing structure.

        Returns
        -------
        list[Stat] :
            A collection of similarity statistics.

        """
        similarity = cosine_similarity(data_point, self._indexing_structure)

        if not np.any(data_point):
            similarity = np.array([[float(np.array_equal(data_point, m)) for m in np.array(self._indexing_structure)]])

        return [Stat(self._id_map[i], x) for i, x in enumerate(similarity[0])]
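
# Hedged usage sketch (not in the original source): cosine similarity compares
# orientation only, so a scaled copy of a stored case scores 1.0. The values
# are illustrative; the query is passed as a 2-D array for compatibility with
# newer scikit-learn versions, although the docstring documents a flat list.
#
#   >>> sim = CosineSimilarity()
#   >>> sim.build_indexing_structure(np.array([[1., 0.], [0., 1.]]), {0: 5, 1: 6})
#   >>> stats = sim.compute_similarity([[2., 0.]])
#   >>> [(s.case_id, round(s.similarity, 1)) for s in stats]
#   [(5, 1.0), (6, 0.0)]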