"""
Visual Bag of Words model for image classification and retrieval.
"""
from collections import Counter
from typing import Any
import cv2
import numpy as np
from machinevisiontoolbox.ImagePointFeatures import BaseFeature2D
from machinevisiontoolbox import Image
# TODO: remove top N% and bottom M% of words by frequency
class BagOfWords:
def __init__(
self,
images,
k: int = 2_000,
nstopwords: int = 0,
attempts: int = 1,
seed: int | None = None,
) -> None:
r"""
Bag of words class
:param images: a sequence of images or set of image features
:type images: :class:`~machinevisiontoolbox.Image` iterable, :class:`~machinevisiontoolbox.PointFeatures.BaseFeature2D`
:param k: number of visual words, defaults to 2000
:type k: int, optional
:param nstopwords: number of stop words, defaults to 50
:type nstopwords: int, optional
:param attempts: number of k-means attempts, defaults to 1
:type attempts: int, optional
Bag of words is a powerful feature-based method for matching images
from widely different viewpoints.
This class creates a bag of words from a sequence of images or a set of
point features. In the former case, the features will have an ``.id``
equal to the index of the image in the sequence. For the latter case,
features must have a valid ``.id`` attribute indicating which image in
the bag they belong to.
k-means clustering is performed to assign a word label to every feature.
The cluster centroids are retained as a :math:`k \times N` array
``.centroids`` with one row per word centroid and each row is a feature
descriptor, 128 elements long in the case of SIFT.
``.words`` is an array of word labels that corresponds to the array of
image features ``.features``. The word labels are integers, initially
in the range [0, ``k``).
Stop words are those visual words that occur most often and we can
remove ``nstopwords`` of them. The centroids are reordered so that the
last ``nstopwords`` rows correspond to the stop words. When a new set
of image features is assigned labels from the ``.centroids`` any with a
label greater that ``.nstopwords`` is a stop word and can be discarded.
:reference:
- Video Google: a text retrieval approach to object matching in videos
J.Sivic and A.Zisserman,
in Proc. Ninth IEEE Int. Conf. on Computer Vision,
pp.1470-1477, Oct. 2003.
- Robotics, Vision & Control for Python, Section 12.4.2,
P. Corke, Springer 2023.
:seealso: :meth:`recall` :meth:`~machinevisiontoolbox.ImagePointFeatures.BaseFeature2D`
:meth:`~machinevisiontoolbox.ImagePointFeatures.SIFT`
`cv2.kmeans <https://docs.opencv.org/4.x/d5/d38/group__core__cluster.html#ga9a34dc06c6ec9460e90860f15bcd2f88>`_
"""
if images is None:
return
if isinstance(images, BaseFeature2D):
# passed the features
features = images
else:
# passed images, compute the features
features = []
for image in images:
features += image.SIFT()
features.sort(by="scale", inplace=True)
self._images = images
# save the image id's
self._image_id = np.r_[features.id]
self._nimages = self._image_id.max() + 1
self._features = features
# do the clustering
# NO IDEA WHAT EPSILON ACTUALLY MEANS, NORM OF THE SHIFT IN CENTROIDS?
# NO IDEA HOW TO TELL WHAT CRITERIA IT TERMINATES ON
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
if seed is not None:
cv2.setRNGSeed(seed)
ret, labels, centroids = cv2.kmeans(
data=features._descriptor,
K=k,
bestLabels=None,
criteria=criteria,
attempts=attempts,
flags=cv2.KMEANS_RANDOM_CENTERS,
)
self._k = k
self._words = labels.ravel()
self._labels = labels.ravel()
self._centroids = centroids
self._word_freq_vectors = None
self._nstopwords = nstopwords
if nstopwords > 0:
self._remove_stopwords()
# compute word frequency vectors
maxwords = self.k - self.nstopwords
W = []
id = np.array(self._features.id)
for i in range(self.nimages):
# get the words associated with image i
words = self.words[id == i]
# create columns of the W
v = BagOfWords._word_freq_vector(words, maxwords)
W.append(v)
W = np.column_stack(W)
N = self.nimages
# total number of occurrences of word i
# multiple occurrences in the one image count only as one
ni = (W > 0).sum(axis=1)
idf = np.log(N / ni)
M = []
for i in range(self.nimages):
# number of words in this image
nd = W[:, i].sum()
# word occurrence frequency
nid = W[:, i]
with np.errstate(divide="ignore", invalid="ignore"):
v = nid / nd * idf
v[~np.isfinite(v)] = 0
M.append(v)
self._word_freq_vectors = np.column_stack(M)
self._idf = idf
[docs]
def wwfv(self, i: int | None = None) -> np.ndarray:
"""
Weighted word frequency vector for image
:param i: image within bag, defaults to all images
:type i: int, optional
:return: word frequency vector or vectors
:rtype: ndarray(K), ndarray(N,K)
This is the word-frequency vector for the ``i``'th image in the bag. The
angle between any two WFVs is an indication of image similarity.
If ``i`` is None then the word-frequency matrix is returned, where the
columns are the word-frequency vectors for the images in the bag.
.. note:: The word vector is expensive to compute so a lazy evaluation
is performed on the first call to this method.
"""
if i is not None:
v = self._word_freq_vectors[:, i]
if v.ndim == 1:
return np.c_[v]
else:
return self._word_freq_vectors
@property
def nimages(self) -> int:
"""
Number of images associated in the bag
:return: number of images
:rtype: int
"""
return self._nimages
@property
def images(self) -> Any:
"""
Images associated with this bag
:return: images associated with this bag
:rtype: :class:`~machinevisiontoolbox.Image` iterable
.. note:: Only valid if the bag was constructed from images rather than features.
"""
return self._images
@property
def k(self) -> int:
"""
Number of words in the visual vocabulary
:return: number of words
:rtype: int
:seealso: :meth:`nstopwords`
"""
return self._k
@property
def words(self) -> np.ndarray:
"""
Word labels for every feature
:return: word labels
:rtype: ndarray(N)
Word labels are arranged such that the top ``nstopwords`` labels are
stop words.
:seealso: :meth:`nstopwords`
"""
return self._words
# TODO better name for above
[docs]
def word(self, f: int) -> int:
"""
Word labels for original feature
:return: word labels
:rtype: ndarray(N)
Word labels are arranged such that the top ``nstopwords`` labels
"""
return self._labels[f]
@property
def nwords(self) -> int:
"""
Number of usable words
:return: number of usable words
:rtype: int
This is ``k`` - ``nstopwords``.
:seealso: :meth:`k` :meth:`nstopwords`
"""
return self._k - self._nstopwords
@property
def nstopwords(self) -> int:
"""
Number of stop words
:return: Number of stop words
:rtype: int
:seealso: :meth:`k` :meth:`nwords`
"""
return self._nstopwords
@property
def firststop(self) -> int:
"""
First stop word
:return: word index of first stop word
:rtype: int
"""
return self.k - self._nstopwords
@property
def centroids(self) -> np.ndarray:
"""
Word feature centroids
:return: centroids of visual word features
:rtype: ndarray(k,N)
Is an array with one row per visual word, and the row is the feature
descriptor vector. eg. for SIFT features it is 128 elements.
Centroids are arranged such that the last ``nstopwords`` rows correspond
to the stop words. After clustering against the centroids, any word
with a label ``>= nstopwords`` is a stop word.
.. note:: The stop words are kept in the centroid array for the recall process.
:seealso: :meth:`similarity`
"""
return self._centroids
def __repr__(self) -> str:
return str(self)
def __str__(self) -> str:
s = f"BagOfWords: {len(self.words)} features from {self.nimages} images"
s += f", {self.nwords} words, {self.nstopwords} stop words"
return s
def _remove_stopwords(self, verbose: bool = True) -> None:
# BagOfWords.remove_stop Remove stop words
#
# B.remove_stop(N) removes the N most frequent words (the stop words)
# from the self. All remaining words are renumbered so that the word
# labels are consecutive.
# words, freq = self.wordfreq()
# index = np.argsort(-freq) # sort descending order
# # top ``nstopwords`` most frequent are the stop words
# stopwords = words[index[:self._nstopwords]]
unique_words, freq = self.wordfreq()
# unique_words will be [0,k)
index = np.argsort(-freq) # sort descending order
stopwords = unique_words[index[: self.nstopwords]] # array of stop words
stopfeatures = freq[stopwords].sum()
print(
f"Removing {stopfeatures} features ({stopfeatures/len(self.words) * 100:.1f}%) associated with {self.nstopwords} most frequent words"
)
k = np.full(index.shape, False, dtype=bool)
k[stopwords] = True
# k = freq > stop_cut # index of all the stop words
# indices of all non-stop words, followed by all stop words
map = np.hstack((unique_words[~k], unique_words[k]))
# create a dictionary from old label to new label
# now all stop words have an index in the range [k-nstopwords, k)
mapdict = {}
for w in unique_words:
mapdict[map[w]] = w
# map the word labels
words = np.array([mapdict[w] for w in self.words])
self._labels = words
# only retain the non stop words
keep = words < self.nstopwords
self._words = words[keep]
self._image_id = self._image_id[keep]
self._features = self._features[keep]
# rearrange the cluster centroids
self._centroids = self._centroids[map]
[docs]
def similarity(self, query) -> np.ndarray:
"""
Compute similarity between bag of words and query
:param query: bag of words or image features
:type query: BagOfWords or ndarray
:return: similarity matrix
:rtype: ndarray(M,N)
The array has rows corresponding to the images in ``self`` and
columns corresponding to the queries in ``query``.
``query`` can be:
- a single image, a list of images, or an Image iterator (like VideoFile,
ZipArchive etc.) for which the visual words are computed using the same
dictionary of visual words as the bag.
- a set of image features, in which case the similarity is computed between the
bag and the query features, or
:seealso: :meth:`.closest`
"""
if isinstance(query, np.ndarray):
wwfv = query
sim = np.empty((wwfv.shape[1], self.nimages))
for j, vj in enumerate(wwfv.T):
for i in range(self.nimages):
vi = self.wwfv(i)
with np.errstate(divide="ignore", invalid="ignore"):
sim[j, i] = np.dot(vi.ravel(), vj) / (
np.linalg.norm(vi) * np.linalg.norm(vj)
)
else:
images = query
if isinstance(images, Image):
images = [images]
# similarity has bag index as column, query index as row
sim = np.empty((len(images), self.nimages))
for j, image in enumerate(images):
features = image.SIFT(id="image")
# assign features to given cluster centroids
# the elements of matches are:
# queryIdx: new feature index
# trainingIdx: cluster centre index
bfm = cv2.BFMatcher(normType=cv2.NORM_L2, crossCheck=False)
matches = bfm.match(features._descriptor, self._centroids)
words = np.array([m.trainIdx for m in matches])
keep = words < self.nstopwords
words = words[keep]
# word occurrence frequency
nid = BagOfWords._word_freq_vector(words, self.k - self.nstopwords)
# number of words in this image
nd = nid.sum()
with np.errstate(divide="ignore", invalid="ignore"):
v2 = nid / nd * self._idf
v2[~np.isfinite(v2)] = 0
for i in range(self.nimages):
v1 = self.wwfv(i).ravel()
with np.errstate(divide="ignore", invalid="ignore"):
sim[j, i] = np.dot(v1, v2) / (
np.linalg.norm(v1) * np.linalg.norm(v2)
)
if sim.shape[0] == 1:
sim = sim[0, :]
return sim
def retrieve(self, images) -> tuple[int, float]:
S = self.similarity(images).ravel()
k = np.argmax(S)
return k, S[k]
[docs]
def features(self, word: int) -> BaseFeature2D:
"""
Get features corresponding to word
:param word: visual word label
:type word: int
:return: features corresponding to this label
:rtype: :class:`~machinevisiontoolbox.PointFeatures.BaseFeature2D`
Return a slice of the image features corresponding to this word label.
The ``.id`` attribute of each feature indicates which image in the bag
it belongs to.
"""
return self._features[self.words == word]
[docs]
def occurrence(self, word: int) -> int:
"""
Number of occurrences of specified word
:param word: visual word label
:type word: int
:return: total number of times that visual ``word`` appears in this bag
:rtype: int
"""
return np.sum(self.words == word)
@staticmethod
def _word_freq_vector(words, maxwords: int) -> np.ndarray:
# create columns of the W
unique, unique_counts = np.unique(words, return_counts=True)
# [w,f] = count_unique(words)
v = np.zeros((maxwords,))
v[unique] = unique_counts
return v
[docs]
def wordfreq(self) -> tuple[np.ndarray, np.ndarray]:
"""
Get visual word frequency
:return: visual words, visual word frequency
:rtype: ndarray, ndarray
Returns two arrays, one containing all visual words, the other containing
the frequency of the corresponding word across all images.
"""
# BagOfWords.wordfreq Word frequency statistics
#
# [W,N] = B.wordfreq[] is a vector of word labels W and the corresponding
# elements of N are the number of occurrences of that word.
return np.unique(self.words, return_counts=True)
[docs]
def closest(self, S: np.ndarray, i: int) -> tuple[np.ndarray, np.ndarray]:
"""
Find closest image
:param S: bag similarity matrix
:type S: ndarray(N,M)
:param i: the query image index
:type i: int
:return: index of the recalled image and similarity
:rtype: int, float
:seealso: :meth:`similarity`
"""
s = S[:, i]
index = np.argsort(-s)
return index, s[index]
[docs]
def contains(self, word: int) -> np.ndarray:
"""
Images that contain specified word
:param word: visual word label
:type word: int
:return: list of images containing this word
:rtype: list
:seealso: :meth:`exemplars`
"""
return np.unique(self._image_id[self.words == word])
[docs]
def exemplars(
self,
word: int,
images=None,
maxperimage: int = 2,
columns: int = 10,
max: int | None = None,
width: int = 50,
**kwargs,
):
"""
Composite image containing exemplars of specified word
:param word: visual word label
:type word: int
:param images: the set of images corresponding to this bag, only
required if the bag was constructed from features not images.
:param maxperimage: maximum number of exemplars drawn from any one image, defaults to 2
:type maxperimage: int, optional
:param columns: number of exemplar images in each row, defaults to 10
:type columns: int, optional
:param max: maximum number of exemplar images, defaults to None
:type max: int, optional
:param width: width of image thumbnail, defaults to 50
:type width: int, optional
:return: composite image
:rtype: :class:`~machinevisiontoolbox.Image`
Produces a grid of examples of a particular visual word.
:seealso: :meth:`contains`
:meth:`~machinevisiontoolbox.ImagePointFeatures.BaseFeature2D.support`
:meth:`~machinevisiontoolbox.Image.Tile`
"""
from machinevisiontoolbox import Image
exemplars = []
count = Counter()
if images is None:
images = self._images
for feature in self.features(word):
count[feature.id] += 1
if count[feature.id] > maxperimage:
continue
exemplars.append(feature.support(images, width))
if max is not None and len(exemplars) >= max:
break
return Image.Tile(exemplars, columns=columns, **kwargs)
if __name__ == "__main__":
    # This module is a library; there is nothing to run as a script and
    # no dedicated test file exists for it.
    pass