#!/usr/bin/env python
"""
Image region features: MSER regions, OCR words and fiducial (ArUco/AprilTag) markers
@author: Dorian Tsai
@author: Peter Corke
"""
# https://docs.opencv.org/4.4.0/d7/d60/classcv_1_1SIFT.html
import numpy as np
import math
import cv2 as cv
import matplotlib.pyplot as plt
from ansitable import ANSITable, Column
from spatialmath import SE3
from machinevisiontoolbox.ImagePointFeatures import BaseFeature2D
def array_result(func):
    """
    Decorator that unwraps single-element results

    :param func: function whose return value supports ``len()`` and indexing
    :type func: callable
    :return: wrapped function
    :rtype: callable

    The wrapped function calls ``func`` and, if the result has a length of
    exactly one, returns its sole element ``result[0]``.  Any other result is
    returned unchanged.  The metadata of ``func`` (doc string, name, ...) is
    passed through to the wrapper so that documentation tools and
    ``property`` see the original function.
    """
    # local import keeps this helper self-contained
    from functools import wraps

    @wraps(func)  # pass through the doc string, name and other metadata
    def innerfunc(*args, **kwargs):
        out = func(*args, **kwargs)
        if len(out) == 1:
            return out[0]
        return out

    return innerfunc
def _fiducial_dict(dict="4x4_1000"):
    """
    Resolve a marker family to an OpenCV ArUco dictionary

    :param dict: marker family name such as ``"4x4_1000"`` or ``"36h11"``, or
        any non-string object, defaults to "4x4_1000"
    :type dict: str or object
    :raises KeyError: ``dict`` is a string that is not a known family name
    :return: result of ``cv.aruco.getPredefinedDictionary`` for a family
        name, otherwise ``dict`` itself, unchanged

    The recognised names are the ArUco families ``NxN_M`` for N in 4..7 and
    M in 50, 100, 250, 1000, plus ``original`` and the AprilTag families
    ``16h5``, ``25h9``, ``36h10`` and ``36h11``.
    """
    # ArUco families: every grid size is available with each tag count
    tag_dict = {
        f"{bits}x{bits}_{count}": getattr(cv.aruco, f"DICT_{bits}X{bits}_{count}")
        for bits in (4, 5, 6, 7)
        for count in (50, 100, 250, 1000)
    }
    tag_dict["original"] = cv.aruco.DICT_ARUCO_ORIGINAL
    # AprilTag families share a common constant prefix
    for family in ("16h5", "25h9", "36h10", "36h11"):
        tag_dict[family] = getattr(cv.aruco, f"DICT_APRILTAG_{family}")

    # anything that is not a name is handed back as is
    if not isinstance(dict, str):
        return dict
    return cv.aruco.getPredefinedDictionary(tag_dict[dict])
class ImageRegionFeaturesMixin:
    def MSER(self, **kwargs):
        """
        Find MSER features in image

        :param kwargs: arguments passed to ``opencv.MSER_create``
        :return: set of MSER features
        :rtype: :class:`MSERFeature`

        Find all the maximally stable extremal regions in the image and
        return an object that represents the MSERs found.  The object behaves
        like a list and can be indexed, sliced and used as an iterator in
        for loops and comprehensions.

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> img = Image.Read("castle.png")
            >>> mser = img.MSER()
            >>> len(mser)  # number of features
            >>> mser[:5].bbox

        :references:
            - Robotics, Vision & Control for Python, Section 12.1.1.2, P. Corke, Springer 2023.

        :seealso: :class:`MSERFeature`, `cv2.MSER_create <https://docs.opencv.org/4.5.2/d3/d28/classcv_1_1MSER.html>`_
        """
        return MSERFeature(self, **kwargs)

    def ocr(self, minconf=50, plot=False):
        """
        Optical character recognition

        :param minconf: minimum confidence value for text to be returned or
            plotted (percentage), defaults to 50
        :type minconf: int, optional
        :param plot: overlay detected text on the current plot, assumed to be the
            image, defaults to False
        :type plot: bool, optional
        :return: detected strings and metadata, or ``None`` if PyTesseract
            cannot be imported
        :rtype: list of :class:`OCRWord`

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> img = Image.Read('penguins.png')
            >>> for word in img.ocr(minconf=90):
            ...     print(word)

        Each recognized text string is described by an :class:`OCRWord` instance
        that contains the string, confidence and bounding box within the image.

        Entries with a negative confidence are always skipped.

        .. warning:: `PyTesseract <https://github.com/madmaze/pytesseract>`_ must be installed.

        :references:
            - Robotics, Vision & Control for Python, Section 12.4.1, P. Corke, Springer 2023.

        :seealso: :class:`OCRWord`
        """
        try:
            import pytesseract
        except ImportError:
            # best effort: tell the user what is missing rather than fail
            print("you need to install pytesseract:")
            return

        # result is a dict of lists, one list element per detected item
        ocr = pytesseract.image_to_data(self.A, output_type=pytesseract.Output.DICT)

        words = []
        for i, raw_conf in enumerate(ocr["conf"]):
            # the confidence may be a number or a numeric string, so coerce
            # it before comparing numerically
            conf = float(raw_conf)
            if conf < 0 or conf < minconf:
                continue
            word = OCRWord(ocr, i)
            if plot:
                word.plot()
            words.append(word)
        return words

    def fiducial(self, dict="4x4_1000", K=None, side=None):
        """
        Find fiducial markers in image

        :param dict: marker type, defaults to "4x4_1000"
        :type dict: str, optional
        :param K: camera intrinsics, defaults to None
        :type K: ndarray(3,3), optional
        :param side: side length of the marker, defaults to None
        :type side: float, optional
        :return: markers found in image, an empty list if none are found
        :rtype: list of :class:`Fiducial` instances

        Find ArUco or AprilTag markers in the scene and return a list of
        :class:`Fiducial` objects, one per marker.  If camera intrinsics and
        the marker side length are provided then also compute the marker pose
        with respect to the camera.

        ``dict`` specifies the marker family or dictionary and describes the
        number of bits in the tag and the number of usable unique tags.

        ============  ========  ===========  =====================
        dict          tag type  marker size  number of unique tags
        ============  ========  ===========  =====================
        ``4x4_50``    Aruco     4x4          50
        ``4x4_100``   Aruco     4x4          100
        ``4x4_250``   Aruco     4x4          250
        ``4x4_1000``  Aruco     4x4          1000
        ``5x5_50``    Aruco     5x5          50
        ``5x5_100``   Aruco     5x5          100
        ``5x5_250``   Aruco     5x5          250
        ``5x5_1000``  Aruco     5x5          1000
        ``6x6_50``    Aruco     6x6          50
        ``6x6_100``   Aruco     6x6          100
        ``6x6_250``   Aruco     6x6          250
        ``6x6_1000``  Aruco     6x6          1000
        ``7x7_50``    Aruco     7x7          50
        ``7x7_100``   Aruco     7x7          100
        ``7x7_250``   Aruco     7x7          250
        ``7x7_1000``  Aruco     7x7          1000
        ``original``  Aruco     ?            ?
        ``16h5``      AprilTag  4x4          30
        ``25h9``      AprilTag  5x5          35
        ``36h10``     AprilTag  6x6          ?
        ``36h11``     AprilTag  6x6          587
        ============  ========  ===========  =====================

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> img = Image.Read('tags.png')
            >>> fiducials = img.fiducial('5x5_50')
            >>> fiducials
            >>> fiducials[0].corners

        :note: ``side`` is the dimension of the square that contains the
            small white squares inside the black background.

        :references:
            - Robotics, Vision & Control for Python, Section 13.6.1, P. Corke, Springer 2023.

        :seealso: :class:`Fiducial`
        """
        dictionary = _fiducial_dict(dict)

        # cornerss is a sequence of marker corners, one element per tag,
        # each element is a 1x4x2 matrix holding corner coordinates
        cornerss, ids, _ = cv.aruco.detectMarkers(self.mono().A, dictionary)

        # ids is None when nothing was detected, which cannot be zipped
        if ids is None or len(ids) == 0:
            return []

        if K is not None and side is not None:
            # intrinsics and marker size known: estimate a pose per marker
            rvecs, tvecs, _ = cv.aruco.estimatePoseSingleMarkers(
                cornerss, side, K, None
            )
            return [
                Fiducial(tag_id[0], corners[0].T, K, rvec, tvec)
                for tag_id, rvec, tvec, corners in zip(ids, rvecs, tvecs, cornerss)
            ]

        return [
            Fiducial(tag_id[0], corners[0].T)
            for tag_id, corners in zip(ids, cornerss)
        ]
# --------------------- supporting classes -------------------------------- #
class MSERFeature:
    def __init__(self, image=None, **kwargs):
        """
        Find MSERs

        :param image: input image, if ``None`` an empty feature set is created
        :type image: :class:`Image`
        :param kwargs: parameters passed to :func:`opencv.MSER_create`

        Find all the maximally stable extremal regions in the image and
        return an object that represents the MSERs found.
        This class behaves like a list and each MSER is an element of the list.

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> img = Image.Read('shark2.png')
            >>> msers = img.MSER()
            >>> len(msers)
            >>> msers[0]
            >>> msers.bbox

        :references:
            - J. Matas, O. Chum, M. Urban, and T. Pajdla.
              "Robust wide baseline stereo from maximally stable extremal regions."
              Proc. of British Machine Vision Conference, pages 384-396, 2002.
            - Robotics, Vision & Control for Python, Section 12.1.2.2, P. Corke, Springer 2023.

        :seealso: :meth:`bbox` :meth:`points`
        """
        # start out empty so that an instance created without an image (as
        # done by __getitem__) is always in a usable state
        self._points = []
        self._bboxes = np.empty((0, 4), dtype=int)

        if image is not None:
            detector = cv.MSER_create(**kwargs)
            msers, bboxes = detector.detectRegions(image.A)
            # msers is a sequence of ndarray(M,2), each row is (u,v)
            # bboxes has one row per region: left, top, width, height
            # https://www.toptal.com/machine-learning/real-time-object-detection-using-mser-in-ios

            # transpose the point arrays so that each is 2xM
            self._points = [mser.T for mser in msers]

            # force a 2D array so the slicing below also works if nothing
            # was detected -- assumes 4 values per box, as used below
            bboxes = np.asarray(bboxes).reshape(-1, 4)
            # convert (left, top, width, height) to (left, top, right, bottom)
            bboxes[:, 2:] = bboxes[:, 0:2] + bboxes[:, 2:]
            self._bboxes = bboxes

    def __len__(self):
        """
        Number of MSER features

        :return: number of features
        :rtype: int

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> img = Image.Read("castle.png")
            >>> mser = img.MSER()
            >>> len(mser)  # number of features

        :seealso: :meth:`__getitem__`
        """
        return len(self._points)

    def __getitem__(self, i):
        """
        Get MSERs from MSER feature object

        :param i: index
        :type i: int, slice, list or tuple of int, ndarray of int or bool
        :raises IndexError: index out of range
        :raises TypeError: index is of an unsupported type
        :return: subset of point features
        :rtype: :class:`MSERFeature` instance

        This method allows a ``MSERFeature`` object to be indexed, sliced or iterated.

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> img = Image.Read("castle.png")
            >>> mser = img.MSER()
            >>> len(mser)  # number of features
            >>> mser[:5]  # first 5 MSER features
            >>> mser[::50]  # every 50th MSER feature

        :seealso: :meth:`len`
        """
        new = self.__class__()

        if isinstance(i, (int, np.integer)):
            # keep the list-of-arrays structure so that len() of the result
            # counts regions (1) rather than rows of the point array (2)
            new._points = [self._points[i]]
            new._bboxes = self._bboxes[np.newaxis, i, :]  # result is 2D
        elif isinstance(i, slice):
            new._points = self._points[i]
            new._bboxes = self._bboxes[i, :]  # result is 2D
        elif isinstance(i, np.ndarray) and np.issubdtype(i.dtype, np.bool_):
            # logical mask, one element per region
            new._points = [pts for pts, keep in zip(self._points, i) if keep]
            new._bboxes = self._bboxes[i, :]
        elif isinstance(i, np.ndarray) and np.issubdtype(i.dtype, np.integer):
            new._points = [self._points[k] for k in i]
            new._bboxes = self._bboxes[i, :]
        elif isinstance(i, (list, tuple)):
            new._points = [self._points[k] for k in i]
            new._bboxes = self._bboxes[list(i), :]
        else:
            # previously this fell through and returned a broken object
            raise TypeError(f"unsupported index type {type(i).__name__}")
        return new

    def __str__(self):
        """
        String representation of MSER

        :return: Brief readable description of MSER
        :rtype: str

        A single region is described by its bounding box, any other number
        of regions is summarised by the count.

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> img = Image.Read("castle.png")
            >>> msers = img.MSER()
            >>> str(msers)
            >>> str(msers[0])
        """
        if len(self) == 1:
            umin, vmin, umax, vmax = self._bboxes[0]
            return f"MSER feature: u: {umin} - {umax}, v: {vmin} - {vmax}"
        # also covers the empty set, which has no bounding box to show
        return f"MSER features, {len(self)} regions"

    def __repr__(self):
        """
        Representation of MSER

        :return: Brief readable description of MSER
        :rtype: str

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> img = Image.Read("castle.png")
            >>> msers = img.MSER()
            >>> msers
            >>> msers[0]
        """
        return str(self)

    @property
    @array_result
    def points(self):
        """
        Points belonging to MSERs

        :return: Coordinates of points in (u,v) format that belong to MSER
        :rtype: ndarray(2,N), list of ndarray(2,N)

        If the object contains just one region the result is an array, otherwise
        it is a list of arrays (with different numbers of columns).

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> import numpy as np
            >>> img = Image.Read("castle.png")
            >>> msers = img.MSER()
            >>> np.set_printoptions(threshold=10)
            >>> msers[0].points
            >>> msers[2:4].points

        :seealso: :meth:`bbox`
        """
        return self._points

    @property
    @array_result
    def bbox(self):
        """
        Bounding boxes of MSERs

        :return: Bounding box of MSER in [umin, vmin, umax, vmax] format
        :rtype: ndarray(4) or ndarray(N,4)

        If the object contains just one region the result is a 1D array,
        otherwise it is a 2D array with one row per bounding box.

        Example:

        .. runblock:: pycon

            >>> from machinevisiontoolbox import Image
            >>> img = Image.Read("castle.png")
            >>> msers = img.MSER()
            >>> msers[0].bbox
            >>> msers[:4].bbox

        :seealso: :meth:`points`
        """
        return self._bboxes
class OCRWord:
    def __init__(self, ocr, i):
        """
        OCR word and metadata

        :param ocr: dict from Tesseract
        :type ocr: dict of lists
        :param i: index of word
        :type i: int
        :return: OCR data for word
        :rtype: :class:`OCRWord` instance

        Describes a word detected by OCR including its metadata which is available
        as a number of properties:

        ========== =======================================================
        Property   Meaning
        ========== =======================================================
        ``text``   recognized text
        ``conf``   confidence in text recognition (percentage)
        ``l``      left coordinate (umin) of rectangle containing the text
        ``t``      top coordinate (vmin) of rectangle containing the text
        ``w``      width of rectangle containing the text
        ``h``      height of rectangle containing the text
        ``ltrb``   bounding box [left, top, right, bottom]
        ========== =======================================================

        :seealso: :meth:`~machinevisiontoolbox.ImageFeatures.ImageFeaturesMixin.ocr`
        """
        # pick the i'th element of every list: dict of lists -> flat dict
        self.dict = {key: values[i] for key, values in ocr.items()}

    def __str__(self):
        """
        String representation of OCR word

        :return: Brief readable description of OCR word
        :rtype: str
        """
        return f"{self.dict['text']} ({self.dict['conf']}%)"

    def __repr__(self):
        """
        Representation of OCR word

        :return: same text as :meth:`__str__`
        :rtype: str
        """
        return str(self)

    @property
    def l(self):
        """
        Left side of word bounding box

        :return: left side coordinate of bounding box in pixels
        :rtype: int

        :seealso: :meth:`t` :meth:`ltrb`
        """
        return self.dict["left"]

    @property
    def t(self):
        """
        Top side of word bounding box

        :return: top side coordinate of bounding box in pixels
        :rtype: int

        :seealso: :meth:`l` :meth:`ltrb`
        """
        return self.dict["top"]

    @property
    def w(self):
        """
        Width of word bounding box

        :return: width of bounding box in pixels
        :rtype: int

        :seealso: :meth:`h` :meth:`ltrb`
        """
        return self.dict["width"]

    @property
    def h(self):
        """
        Height of word bounding box

        :return: height of bounding box in pixels
        :rtype: int

        :seealso: :meth:`w` :meth:`ltrb`
        """
        return self.dict["height"]

    @property
    def ltrb(self):
        """
        Word bounding box

        :return: bounding box [left top right bottom] in pixels
        :rtype: list

        :seealso: :meth:`l` :meth:`t` :meth:`w` :meth:`h`
        """
        return [self.l, self.t, self.l + self.w, self.t + self.h]

    @property
    def conf(self):
        """
        Word confidence

        :return: confidence of word (percentage)
        :rtype: int

        :seealso: :meth:`text`
        """
        return self.dict["conf"]

    @property
    def text(self):
        """
        Word as a string

        :return: word
        :rtype: str

        :seealso: :meth:`conf`
        """
        return self.dict["text"]

    def plot(self):
        """
        Plot word and bounding box

        Plot a label box around the word in the image, and show the OCR string
        in the label field.

        :seealso: :func:`~machinevisiontoolbox.base.graphics.plot_labelbox`
        """
        # plot_labelbox is not imported at the top of this file; import it
        # here from the module named in the seealso above
        # NOTE(review): confirm this is the intended import location
        from machinevisiontoolbox.base.graphics import plot_labelbox

        plot_labelbox(
            self.text,
            tl=(self.l, self.t),
            wh=(self.w, self.h),
            color="y",
            linestyle="--",
        )
class Fiducial:
    def __init__(self, id, corners, K=None, rvec=None, tvec=None):
        """
        Properties of a visual fiducial marker

        :param id: identity of the marker
        :type id: int
        :param corners: image plane marker corners
        :type corners: ndarray(2, 4)
        :param K: camera intrinsics
        :type K: ndarray(3,3), optional
        :param rvec: rotation of marker with respect to camera, as an Euler vector
        :type rvec: ndarray(3), optional
        :param tvec: translation of marker with respect to camera
        :type tvec: ndarray(3), optional

        If either ``rvec`` or ``tvec`` is not given the marker has no pose
        and :meth:`pose` is ``None``.

        :seealso: :meth:`id` :meth:`pose` :meth:`draw`
            :meth:`~machinevisiontoolbox.ImageFeatures.ImageFeaturesMixin.fiducial`
        """
        self._id = id
        self.corners = corners
        self.K = K
        self.rvec = rvec
        self.tvec = tvec
        # the pose arguments are optional, only build a pose if both are given
        if rvec is None or tvec is None:
            self._pose = None
        else:
            self._pose = SE3(tvec) * SE3.EulerVec(rvec.flatten())

    def __str__(self):
        """
        String representation of fiducial

        :return: Brief readable description of fiducial id and pose
        :rtype: str

        The pose is included only if it is known.
        """
        s = f"id={self.id}"
        if self.pose is not None:
            s += ": " + self.pose.strline()
        return s

    def __repr__(self):
        """
        Representation of fiducial

        :return: same text as :meth:`__str__`
        :rtype: str
        """
        return str(self)

    @property
    def id(self):
        """
        Fiducial id

        :return: fiducial marker identity
        :rtype: int

        Returns the built in identity code of the April tag or arUco marker.
        """
        return self._id

    @property
    def pose(self):
        """
        Fiducial pose

        :return: marker pose, or ``None`` if no pose was given at construction
        :rtype: SE3

        Returns the pose of the tag with respect to the camera.  The x- and
        y-axes are in the marker plane and the z-axis is out of the marker.

        :note: Accurate camera intrinsics and dimension parameters are
            required for this value to be metric.
        """
        return self._pose

    def draw(self, image, length=100, thick=2):
        """
        Draw marker coordinate frame into image

        :param image: image with BGR color order
        :type image: :class:`Image`
        :param length: axis length in pixels, defaults to 100
        :type length: int, optional
        :param thick: axis thickness in pixels, defaults to 2
        :type thick: int, optional
        :raises ValueError: image must have BGR color order
        :raises ValueError: the marker has no camera intrinsics or pose

        Draws a coordinate frame into the image representing the pose of the
        marker.  The x-, y- and z-axes are drawn as red, green and blue line
        segments.

        :note: NOTE(review): ``length`` is passed straight to
            ``cv.drawFrameAxes``; confirm whether its units are really pixels
            or the same units as ``tvec``.
        """
        if not image.isbgr:
            raise ValueError("image must have BGR color order")
        if self.K is None or self.rvec is None or self.tvec is None:
            # nothing to draw without intrinsics and a pose
            raise ValueError("marker has no camera intrinsics or pose")

        # empty array: no lens distortion coefficients are supplied
        cv.drawFrameAxes(
            image.A, self.K, np.array([]), self.rvec, self.tvec, length, thick
        )
class ArUcoBoard:
    # potentially inherit from abstract MarkerBoard class
    def __init__(self, layout, sidelength, separation, dict, name=None):
        r"""Create a MarkerBoard object

        :param layout: number of markers in the x- and y-directions
        :type layout: 2-tuple of int
        :param sidelength: Side length of each marker
        :type sidelength: float
        :param separation: White space between markers, must be the same in both directions
        :type separation: float
        :param dict: marker type, eg. '6x6_1000'
        :type dict: str
        :param name: name of the board, defaults to None
        :type name: str, optional
        :raises ValueError: if the ``layout`` is not a 2-tuple of integers

        This object represents a board of markers, such as an ArUco board.  The board comprises
        a regular grid of markers each of which has a known ``sidelength`` and ``separation``.  The grid
        has :math:`n_x \times n_y` markers in the x and y directions respectively, and
        ``layout`` is :math:`(n_x, n_y)`.  The type of markers, ArUco or custom, is specified by the
        ``dict`` parameter.

        :note: the dimensions must be in the same units as camera focal length and
            pixel size, typically meters.
        """
        if len(layout) != 2:
            raise ValueError("layout must be a tuple of two integers")

        self._layout = layout
        self._sidelength = sidelength
        self._separation = separation
        self._name = name
        self._dict = _fiducial_dict(dict)
        self._board = cv.aruco.GridBoard(layout, sidelength, separation, self._dict)

        # board pose, unknown until estimatePose() succeeds
        self._rvec = None
        self._tvec = None

    def estimatePose(self, image, camera):
        """Estimate the pose of the board

        :param image: image containing the board
        :type image: Image
        :param camera: model of the camera, including intrinsics and distortion parameters
        :type camera: :class:`CentralCamera`
        :raises ValueError: the boards pose could not be estimated
        :return: Camera pose with respect to board origin, vector of residuals in units of pixels in marker ID order, corresponding marker IDs
        :rtype: 3-tuple of SE3, numpy.ndarray, numpy.ndarray

        Residuals are the Euclidean distance between the detected marker corners and the
        reprojected corners in the image plane, in units of pixels.  The mean and maximum
        residuals are useful for assessing the quality of the pose estimate.

        The estimated rotation and translation vectors are retained for use
        by :meth:`draw`.
        """
        # find the markers
        corners, ids, _ = cv.aruco.detectMarkers(image.mono().A, self._dict)
        if ids is None or len(ids) == 0:
            # ids is None when nothing is detected
            raise ValueError("no markers detected in image")

        # match the markers to the board
        objPoints, imgPoints = self._board.matchImagePoints(corners, ids)
        if objPoints is None or len(objPoints) == 0:
            raise ValueError("no detected markers belong to this board")

        # solve for camera pose
        retval, rvec, tvec = cv.solvePnP(
            objPoints, imgPoints, camera.K, camera.distortion
        )
        if not retval:
            raise ValueError("solvePnP failed")

        self._tvec = tvec
        self._rvec = rvec

        # compute the reprojection error
        reprojection, _ = cv.projectPoints(
            objPoints, rvec, tvec, camera.K, camera.distortion
        )
        diff = (imgPoints - reprojection).squeeze()
        residuals = np.linalg.norm(diff, axis=1)

        T = SE3(tvec) * SE3.EulerVec(rvec.flatten())
        return T, residuals, ids.flatten()

    def draw(self, image, camera, length=0.1, thick=2):
        """
        Draw board coordinate frame into image

        :param image: image with BGR color order
        :type image: :class:`Image`
        :param camera: model of the camera, including intrinsics and distortion parameters
        :type camera: :class:`CentralCamera`
        :param length: axis length in metric units, defaults to 0.1
        :type length: float, optional
        :param thick: axis thickness in pixels, defaults to 2
        :type thick: int, optional
        :raises ValueError: image must have BGR color order
        :raises ValueError: the board pose has not been estimated

        Draws a coordinate frame into the image representing the pose of the
        board.  The x-, y- and z-axes are drawn as red, green and blue line
        segments.  The pose used is the one found by the most recent
        successful call to :meth:`estimatePose`.

        :note: the ``length`` is specified in the same units as focal length and
            pixel size of the camera, and the marker dimensions, typically meters.
        """
        if not image.isbgr:
            raise ValueError("image must have BGR color order")
        if self._rvec is None or self._tvec is None:
            raise ValueError("board pose is unknown, call estimatePose() first")

        cv.drawFrameAxes(
            image.A, camera.K, camera.distortion, self._rvec, self._tvec, length, thick
        )

    def chart(self, filename, dpi=100):
        """Write ArUco chart to a file

        :param filename: name of the file to write
        :type filename: str
        :param dpi: dots per inch of printer, defaults to 100
        :type dpi: int, optional

        PIL is used to write the file, and can support multiple formats (specified
        by the file extension) such as PNG, PDF, etc.

        If a PDF file is written the chart can be printed at 100% scale factor and will
        have the correct dimensions.  The size of the chart is invariant to the
        ``dpi`` parameter, which simply affects the resolution of the image and file size.

        :note: This method assumes that the dimensions given in the constructor are in
            meters.
        """
        # imported here so that PIL is only needed when a chart is written
        from PIL import Image as PILImage

        # dots per metre, there are 25.4 mm per inch
        dpm = dpi * 1000 / 25.4

        # size of the chart in metres: n markers and n-1 gaps along each axis
        pitch = self._sidelength + self._separation
        width = self._layout[0] * pitch - self._separation
        height = self._layout[1] * pitch - self._separation

        # convert to pixels and generate the image
        img = self._board.generateImage((int(width * dpm), int(height * dpm)))

        PILImage.fromarray(img).save(filename, dpi=(dpi, dpi))
if __name__ == "__main__":
    # Smoke test: detect MSERs in a sample image and exercise every
    # supported way of indexing the resulting feature set.
    from machinevisiontoolbox import Image

    castle = Image.Read("castle.png")
    regions = castle.MSER()
    print(len(regions))
    print(regions)

    # a single region selected by integer index
    first = regions[0]
    print(first)
    print(first.bbox.shape)
    print(first.bbox)
    print(first.points.shape)
    print(first.points)

    # the same report for each multi-element selector: slice, boolean mask,
    # list of indices and integer array
    selectors = (
        slice(None, 5),
        np.arange(len(regions)) < 5,
        [0, 2, 1, 3, 4],
        np.array([0, 2, 1, 3, 4]),
    )
    for selector in selectors:
        subset = regions[selector]
        print(subset)
        print(subset.bbox.shape)
        print(subset.bbox)
        print(len(subset))
        print(subset.points)