# Source code for inaFaceAnalyzer.face_detector

#!/usr/bin/env python
# encoding: utf-8

# The MIT License

# Copyright (c) 2021 Ina (David Doukhan & Zohra Rezgui- http://www.ina.fr/)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"""
Face detection classes are in charge of finding faces in image frames.

Two face detection classes are provided : 
    - :class:`LibFaceDetection` (default) 
    - :class:`OcvCnnFacedetector`.


Face detection classes inherit from the abstract class :class:`FaceDetector` and share a common interface.
They are designed as `*function objects* or *functors* <https://en.wikipedia.org/wiki/Function_object>`_
taking image frames as input and returning lists of :class:`Detection` instances.

>>> from inaFaceAnalyzer.opencv_utils import imread_rgb
>>> from inaFaceAnalyzer.face_detector import LibFaceDetection
>>> # read image
>>> img = imread_rgb('./media/dknuth.jpg')
>>> # instantiate a detector (costly - to be done a single time)
>>> detector = LibFaceDetection()
>>> # call the detector instance as a function - setting verbose to True is slower, but displays intermediate results
>>> ldetections = detector(img, verbose=True)
>>> print(ldetections)
[Detection(bbox=Rect(x1=113.9406801111573, y1=63.12627956950275, x2=287.63299981285394, y2=280.43775060093793), detect_conf=0.9999985098838806)]

"""


from abc import ABC, abstractmethod
from typing import NamedTuple
import cv2
import numpy as np
import onnxruntime

from .rect import Rect
from .remote_utils import get_remote
from .opencv_utils import disp_frame_shapes, disp_frame
from .libfacedetection_priorbox import PriorBox


class Detection(NamedTuple):
    """
    Atomic element returned by face detection classes.

    A detection pairs the bounding box of a face with the confidence
    reported by the detection engine.
    """
    #: position of the detected face in the image in pixels
    bbox: Rect
    #: face detection confidence (0 = lowest confidence, 1 = highest confidence)
    detect_conf: float


# Currently, we wish to use the same eye detection procedure for all detection engines
# class DetectionEyes(NamedTuple):
#     """
#     Contains a detection (Rect bounding box & detection confidence)
#     + eyes coordinates (x1, y1, x2, y2) for left eye and right eye
#     """
#     bbox : Rect
#     detect_conf : float
#     eyes : Rect


class FaceDetector(ABC):
    """
    Abstract base class shared by all face detection engines.

    Concrete engines implement :meth:`_call_imp`, which receives an image
    frame and returns a list of :class:`Detection`. The common
    :meth:`__call__` wrapper adds optional black padding, minimal face size
    filtering and optional display of intermediate results.
    """

    #    @classmethod
    #    @abstractmethod
    #    def output_type() : pass
    #: type of the elements returned by the detector
    output_type = Detection

    def __init__(self, minconf, min_size_px, min_size_prct, padd_prct):
        """
        Common face detection constructor

        Args:
            minconf (float between 0 and 1): the minimal face detection confidence being returned (default values dependent on the face detection class chosen).
            min_size_px (int): minimal face size in pixels (default 30): better classification results requires face sizes above 75 pixels.
            min_size_prct (float between 0 and 1): minimal face size as a percentage of image frame minimal dimension. Allow to focus on the most relevant faces.
            padd_prct (float between 0 and 1): percentage of black padding pixels to be applied on images before detection (default values are set for each detection class).

        """
        self.minconf = minconf
        self.min_size_px = min_size_px
        self.min_size_prct = min_size_prct
        self.padd_prct = padd_prct

    def __call__(self, frame, verbose=False):
        """
        Perform face detection on image frames

        Args:
            frame (:class:`numpy.ndarray`): RGB image frame (height, width, 3).
            verbose (bool, optional): display intermediate results such as detected faces. Not to be used in production. Defaults to False.

        Returns:
            list of :class:`Detection` instances
        """
        use_padding = bool(self.padd_prct)

        # detection is run on the padded frame when padding is requested
        if use_padding:
            work_frame, yoffset, xoffset = _blackpadd(frame, self.padd_prct)
        else:
            work_frame = frame

        lret = self._call_imp(work_frame)

        # filter detected faces to return only faces with a dimension length
        # (absolute or relative) above the requested minimum:
        # face classification algorithms may be affected by small face sizes.
        # The relative threshold is based on the ORIGINAL (unpadded) frame.
        min_frame_dim = min(frame.shape[:2])
        min_face_size = max(self.min_size_px, self.min_size_prct * min_frame_dim)
        if min_face_size > 0:
            lret = [e for e in lret if e.bbox.max_dim_len >= min_face_size]

        # map coordinates found in the padded frame back to the original frame
        if use_padding:
            lret = [e._replace(bbox=e.bbox.transpose(-xoffset, -yoffset)) for e in lret]

        if verbose:
            print('%d DETECTED FACES' % len(lret))
            disp_frame_shapes(frame, [e.bbox for e in lret], [])
            for detection in lret:
                x1, y1, x2, y2 = detection.bbox.to_int()
                print(detection)
                # clip the crop to the frame boundaries before display
                x1 = max(x1, 0)
                y1 = max(y1, 0)
                x2 = min(x2, frame.shape[1])
                y2 = min(y2, frame.shape[0])
                disp_frame(frame[y1:y2, x1:x2, :])

        return lret

    @abstractmethod
    def _call_imp(self, frame):
        """
        Engine specific detection, to be implemented by subclasses.

        Args:
            frame (:class:`numpy.ndarray`): image frame (already padded when
                padding is enabled).

        Returns:
            list of :class:`Detection` instances
        """

    def most_central_face(self, frame, contain_center=True, verbose=False):
        """
        To be used for processing ML datasets and training new face classification models.

        Some ML face corpora images contain several faces, with the target annotated face at the center.

        This method returns the detected face which is closest to the center of the image frame

        Args:
            frame (:class:`numpy.ndarray`): RGB image frame (height, width, 3)
            contain_center (bool, optional): if True, the returned face MUST include image center. Defaults to True.
            verbose (bool, optional): Display detected faces. Defaults to False.

        Returns:
            Detection: if a face matching the conditions has been detected, else None

        """
        # (x, y) coordinates of the frame center
        frame_center = (frame.shape[1] / 2, frame.shape[0] / 2)

        faces = list(self(frame, verbose))
        if contain_center:
            # keep faces containing image center
            faces = [f for f in faces if frame_center in f.bbox]

        if not faces:
            return None

        ldists = [_sqdist(f.bbox.center, frame_center) for f in faces]
        return faces[np.argmin(ldists)]

    def get_closest_face(self, frame, ref_bbox, min_iou=.7, squarify=True, verbose=False):
        """
        To be used for processing ML datasets and training new face classification models.

        Some face corpora images may contain several annotated faces.
        This method returns the detected face having the largest IOU with target ref_bbox.
        The IOU must be >= min_iou.

        Args:
            frame (:class:`numpy.ndarray`): RGB image frame (height, width, 3).
            ref_bbox (tuple or Rect): reference face bounding box (x1, y1, x2, y2).
            min_iou (float, optional): minimal acceptable intersection over union between
                the detected face to be returned and the reference bounding box. Defaults to .7.
            squarify (bool, optional): if True, the IOU is computed between the
                smallest square bounding boxes containing the reference and
                the detected faces. If False, the original bounding boxes are
                compared. The returned detection is never modified. Defaults to True.
            verbose (bool, optional): display intermediate results. Defaults to False.

        Returns:
            :class:`Detection` or None: detected face matching the criteria (largest IOU with ref_bbox and IOU >= min_iou), else None
        """
        if not isinstance(ref_bbox, Rect):
            ref_bbox = Rect(*ref_bbox)

        # the reference box is squarified a single time
        if squarify:
            ref_bbox = ref_bbox.square

        lfaces = self(frame, verbose)
        if len(lfaces) == 0:
            return None

        # boxes actually compared to the reference
        lboxes = [d.bbox.square if squarify else d.bbox for d in lfaces]
        liou = [ref_bbox.iou(box) for box in lboxes]

        if verbose:
            print(lboxes)
            print('liou', liou)

        am = np.argmax(liou)
        if liou[am] < min_iou:
            return None
        return lfaces[am]



def _sqdist(p1, p2):
    '''
    Squared euclidean distance between two 2D points.

    Both p1 and p2 are indexable as (x, y); only the first two components
    are read.
    '''
    delta_x = p1[0] - p2[0]
    delta_y = p1[1] - p2[1]
    return delta_x ** 2 + delta_y ** 2

def _blackpadd(frame, paddpercent):
    '''
    Surround a (height, width, channels) frame with a black border.

    The border thickness is paddpercent * width on the left and right sides,
    and paddpercent * height on the top and bottom sides (truncated to int).

    Returns the padded frame, followed by the vertical and horizontal
    offsets of the original frame inside the padded one.
    '''
    height, width, channels = frame.shape
    yoffset = int(height * paddpercent)
    xoffset = int(width * paddpercent)

    # zero-filled (black) canvas sharing the dtype of the input
    padded_shape = (height + 2 * yoffset, width + 2 * xoffset, channels)
    canvas = np.zeros(padded_shape, dtype=frame.dtype)

    # paste the original frame at the center of the canvas
    rows = slice(yoffset, height + yoffset)
    cols = slice(xoffset, width + xoffset)
    canvas[rows, cols, :] = frame
    return canvas, yoffset, xoffset



class OcvCnnFacedetector(FaceDetector):
    """
    This class wraps OpenCV default CNN face detection model.
    Images are first resized to 300*300 pixels, which may result in missing the
    smallest faces but allows to get fast detection time.
    """
    #output_type = Detection

    def __init__(self, minconf=0.65, min_size_px=30, min_size_prct=0, padd_prct=0.15):
        """
        Load the OpenCV CNN face detection model.

        Args:
            minconf (float between 0 and 1): minimal detection confidence. Defaults to 0.65.
            min_size_px (int): minimal face size in pixels. Defaults to 30.
            min_size_prct (float between 0 and 1): minimal face size relative to the frame minimal dimension. Defaults to 0.
            padd_prct (float between 0 and 1): black padding percentage applied before detection. Defaults to 0.15.
        """
        super().__init__(minconf, min_size_px, min_size_prct, padd_prct)

        # model weights and graph description are resolved through get_remote
        fpb = get_remote('opencv_face_detector_uint8.pb')
        fpbtxt = get_remote('opencv_face_detector.pbtxt')
        self.model = cv2.dnn.readNetFromTensorflow(fpb, fpbtxt)

    def _call_imp(self, frame):
        """
        Detect faces from an image

        Parameters:
            frame (array): Image to detect faces from.

        Returns:
            faces_data (list) : List of :class:`Detection` containing :
                                - the bounding box (in pixels)
                                - face detection confidence score
        """
        h, w = frame.shape[:2]

        # The CNN is intended to work on images resized to 300*300
        # tests were carried on using different input sizes and were
        # associated to unsatisfactory results
        blob = cv2.dnn.blobFromImage(frame, 1.0, (300, 300), [104, 117, 123], True, False)
        self.model.setInput(blob)
        detections = self.model.forward()

        faces_data = []
        # Each candidate row is read as [..., confidence, x1, y1, x2, y2]
        # with coordinates relative to the frame size.
        # Every candidate is inspected, so the result does not depend on
        # the candidates being sorted by decreasing confidence.
        for candidate in detections[0, 0]:
            confidence = candidate[2]
            if confidence < self.minconf:
                continue

            bbox = Rect(*candidate[3:7])
            # remove noisy detections: boxes lying fully outside the frame
            if bbox.x1 >= 1 or bbox.y1 >= 1 or bbox.x2 <= 0 or bbox.y2 <= 0:
                continue
            # remove degenerate (empty or inverted) boxes
            if bbox.x1 >= bbox.x2 or bbox.y1 >= bbox.y2:
                continue

            # Map relative coordinates 0...1 to absolute frame width and height
            faces_data.append(Detection(bbox.mult(w, h), confidence))

        return faces_data


class LibFaceDetection(FaceDetector):
    """
    This class wraps the face detection model provided in
    `libfacedetection <https://github.com/ShiqiYu/libfacedetection>`_ :
    a recent face detection library (2021) that
    can take advantage of GPU acceleration and is able to detect the smallest faces.
    It may be slow when used with high resolution images.

    For more details, please refer to :
    Peng, H., & Yu, S. (2021). A systematic iou-related method: Beyond simplified regression for better localization. IEEE Transactions on Image Processing, 30, 5032-5044.
    """

    # output_type = DetectionEyes
    #output_type = Detection

    def __init__(self, minconf=.98, min_size_px=30, min_size_prct=0, padd_prct=0):
        """
        Load the libfacedetection ONNX model.

        Args:
            minconf (float between 0 and 1): minimal detection confidence. Defaults to 0.98.
            min_size_px (int): minimal face size in pixels. Defaults to 30.
            min_size_prct (float between 0 and 1): minimal face size relative to the frame minimal dimension. Defaults to 0.
            padd_prct (float between 0 and 1): black padding percentage applied before detection. Defaults to 0.
        """
        super().__init__(minconf, min_size_px, min_size_prct, padd_prct)
        model_src = get_remote('libfacedetection-yunet.onnx')
        try:
            self.model = onnxruntime.InferenceSession(model_src, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
        except Exception:
            # best effort fallback to CPU-only inference when the session
            # cannot be created with the CUDA provider; KeyboardInterrupt
            # and SystemExit are intentionally not intercepted
            self.model = onnxruntime.InferenceSession(model_src, providers=['CPUExecutionProvider'])
        self.nms_thresh = 0.3 # Threshold for non-max suppression
        self.keep_top_k = 750 # Keep keep_top_k for results outputing
        # PriorBox instances indexed by frame (width, height)
        self.dprior = {}

    @staticmethod
    def _as_2d(dets):
        """
        Return detections as a 2D (n_detections, n_values) array.

        Some environments (observed with Google Colab) provide a
        (n, 1, n_values) array: the singleton axis is dropped.
        """
        if len(dets.shape) == 3 and dets.shape[1] == 1:
            dets = dets.reshape((dets.shape[0], dets.shape[2]))
        assert len(dets.shape) == 2, dets.shape
        return dets

    def _call_imp(self, frame):
        """
        Detect faces in an RGB image frame.

        Args:
            frame (:class:`numpy.ndarray`): RGB image frame (height, width, 3).

        Returns:
            list of :class:`Detection` instances
        """
        bgr_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        h, w, _ = frame.shape

        # convert to NN input: (1, channels, height, width) float32
        blob = np.expand_dims(np.transpose(bgr_frame, (2, 0, 1)), axis = 0).astype(np.float32)
        # NN inference
        loc, conf, iou = self.model.run([], {'input': blob})

        # Decode bboxes and landmarks
        # TODO: set a limit of dict length ? There may be RAM issues when
        # considering a large image collection with heterogenous sizes
        if (w, h) not in self.dprior:
            self.dprior[(w, h)] = PriorBox(input_shape=(w, h), output_shape=(w, h))
        pb = self.dprior[(w, h)]

        dets = self._as_2d(pb.decode(loc, conf, iou, self.minconf))
        if dets.shape[0] == 0:
            return []

        # NMS from OpenCV
        keep_idx = cv2.dnn.NMSBoxes(
            bboxes=dets[:, 0:4].tolist(),
            scores=dets[:, -1].tolist(),
            score_threshold=self.minconf,
            nms_threshold=self.nms_thresh,
            eta=1,
            top_k=self.keep_top_k)
        # Indices are flattened to a 1D integer array: whatever container
        # or shape NMSBoxes returns, the selection below stays 2D, and an
        # empty NMS result selects no row.
        keep_idx = np.asarray(keep_idx, dtype=int).reshape(-1)
        dets = dets[keep_idx]

        assert len(dets.shape) == 2, dets.shape
        assert dets.shape[1] == 15, dets.shape

        lret = []
        for row in dets:
            # first 4 values: top-left corner, box width and box height
            x1, y1, box_w, box_h = row[:4]
            # last value: detection score
            score = row[-1]
            bbox = Rect(x1, y1, x1 + box_w, y1 + box_h)
            #eyes = Rect(*row[4:8])
            #lret.append(DetectionEyes(bbox, score, eyes))
            lret.append(Detection(bbox, score))

        return lret

class IdentityFaceDetector(FaceDetector):
    """
    This class does not detect faces and returns a single bounding box
    corresponding to the whole image frame.
    It should be used for processing images or videos corresponding to
    already-detected cropped faces.
    """
    #output_type = Detection

    def __init__(self):
        """
        IdentityFaceDetector Constructor does not require arguments
        """
        # no confidence threshold, no minimal face size, no padding
        super().__init__(0, 0, 0, 0)

    def _call_imp(self, frame):
        """
        Return a single detection covering the whole frame.

        Args:
            frame (:class:`numpy.ndarray`): image frame (height, width, ...).

        Returns:
            list containing one :class:`Detection` whose confidence is NaN
        """
        height, width = frame.shape[:2]
        # np.nan is the canonical NumPy spelling of the NaN constant
        return [Detection(Rect(0, 0, width, height), np.nan)]


class PrecomputedDetector(FaceDetector):
    """
    Detector replaying bounding boxes provided at construction time.

    Each call consumes the next element of the provided sequence instead of
    analysing the image frame.
    """
    #output_type = Detection

    def __init__(self, lbbox=None):
        """
        Args:
            lbbox (iterable, optional): one element per future call. Each
                element is either a single bounding box given as a tuple,
                or an iterable of bounding boxes. Defaults to no element.
        """
        super().__init__(0, 0, 0, 0)
        # private copy: the caller's container is never modified
        self.lbbox = [] if lbbox is None else list(lbbox)

    def _call_imp(self, frame):
        """
        Return the detections stored for the current call.

        Args:
            frame: ignored.

        Returns:
            list of :class:`Detection` with a None confidence; empty list
            once all stored elements have been consumed.
        """
        if len(self.lbbox) == 0:
            return []
        ret = self.lbbox.pop(0)
        # a single bounding box is handled as a one element list
        if isinstance(ret, tuple):
            ret = [ret]
        return [Detection(Rect(*e), None) for e in ret]


def facedetection_cmdline(parser):
    '''
    Register face detection options on an existing command line parser.

    The options are gathered in a dedicated argument group.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        command line parser to be updated in place

    '''
    group = parser.add_argument_group('optional arguments related to face detection')

    # detection engine
    group.add_argument(
        '--face_detector',
        choices=['LibFaceDetection', 'OcvCnnFacedetector'],
        default='LibFaceDetection',
        help='''face detection module to be used:
                         LibFaceDetection can take advantage of GPU acceleration and has a higher recall.
                         OcvCnnFacedetector is embedded in OpenCV. It is faster for large resolutions since it first resize input frames to 300*300. It may miss small faces''',
    )

    # confidence threshold (no default: each engine has its own)
    group.add_argument(
        '--face_detection_confidence',
        type=float,
        help='''minimal confidence threshold to be used for face detection.
                        Default values are 0.98 for LibFaceDetection and 0.65 for OcvCnnFacedetector''',
    )

    # absolute minimal face size
    group.add_argument(
        '--min_face_size_px',
        dest='size_px',
        type=int,
        default=30,
        help='''minimal absolute size in pixels of the faces to be considered for the analysis.
                    Optimal classification results are obtained for sizes above 75 pixels.''',
    )

    # relative minimal face size
    group.add_argument(
        '--min_face_size_percent',
        dest='size_prct',
        type=float,
        default=0,
        help='''minimal relative size (percentage between 0 and 1) of the
                    faces to be considered for the analysis with respect to image frames
                    minimal dimension (generally height for videos)''',
    )

    # black padding (no default: each engine has its own)
    group.add_argument(
        '--face_detection_padding',
        dest='face_detection_padding',
        type=float,
        default=None,
        help='''Black padding percentage to be applied to image frames before face detection.
                    0.15 Padding may help detecting large faces occupying the whole image with OcvCnnFacedetector.
                    Default padding values are 0.15 for OcvCnnFacedetector and 0 for LibFaceDetection''',
    )

def facedetection_factory(args):
    '''
    Instantiate a face detection object from parsed command line arguments

    Parameters
    ----------
    args : Namespace
        Namespace containing fields face_detector, face_detection_confidence,
        face_detection_padding, size_px and size_prct

    Returns
    -------
    instance of class FaceDetector

    Raises
    ------
    NotImplementedError
        if args.face_detector is not a supported face detection class name

    '''
    dargs = {'min_size_px': args.size_px, 'min_size_prct': args.size_prct}

    # Padding and confidence are forwarded only when explicitly provided,
    # so each detection class keeps its own default values.
    # Comparison to None allows an explicit 0 to be taken into account.
    if args.face_detection_padding is not None:
        dargs['padd_prct'] = args.face_detection_padding
    if args.face_detection_confidence is not None:
        dargs['minconf'] = args.face_detection_confidence

    if args.face_detector == 'LibFaceDetection':
        return LibFaceDetection(**dargs)
    if args.face_detector == 'OcvCnnFacedetector':
        return OcvCnnFacedetector(**dargs)
    raise NotImplementedError(args.face_detector)