# Source code for inaFaceAnalyzer.inaFaceAnalyzer

#!/usr/bin/env python
# encoding: utf-8

# The MIT License

# Copyright (c) 2019-2021 Ina (David Doukhan & Zohra Rezgui - http://www.ina.fr/)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"""
The inaFaceAnalyzer module implements four analysis engines for processing
video or image streams:

    - :class:`ImageAnalyzer` : default choice for image files (jpg, png, etc...)
    - :class:`VideoAnalyzer` : default choice for video files (MP4, avi, etc..)
    - :class:`VideoKeyframes` : processes only video `keyframes <https://en.wikipedia.org/wiki/Video_compression_picture_types>`_ (faster decoding)
    - :class:`VideoTracking` : Face detection is combined with face tracking



Media analyzer classes share a common interface inherited from the abstract class :class:`FaceAnalyzer`.
They are designed as `*function objects* or *functors* <https://en.wikipedia.org/wiki/Function_object>`_
and can be used as functions, executing the code implemented in their `__call__` methods,
with the first argument corresponding to the media to analyze, and returning a :class:`pandas.DataFrame`.



Custom face detection,  face classifier, eye detection and image preprocessing strategies can be provided in the constructor.

>>> from inaFaceAnalyzer.inaFaceAnalyzer import VideoAnalyzer
>>> # a video analyzer instance with default parameters
>>> va = VideoAnalyzer()
>>> df = va(sample_vid)
"""


import pandas as pd
from abc import ABC, abstractmethod
from .opencv_utils import video_iterator, image_iterator, analysisFPS2subsamp_coeff, imwrite_rgb
from .pyav_utils import video_keyframes_iterator
from .face_tracking import TrackerDetector
from .face_detector import LibFaceDetection, PrecomputedDetector
from .face_classifier import Resnet50FairFaceGRA
from .face_alignment import Dlib68FaceAlignment
from .face_preprocessing import preprocess_face


class FaceAnalyzer(ABC):
    """
    Abstract base class containing the pipeline used to process
    images and videos, with or without tracking:

    * image/video decoding
    * face detection
    * face tracking (optional)
    * eye detection
    * face preprocessing
    * face classification
    """

    def __init__(self, face_detector = None, face_classifier = None, batch_len=32, verbose = False):
        """
        Construct a face processing pipeline composed of a face detector,
        a face preprocessing strategy and a face classifier.
        The face preprocessing strategy is defined based on the classifier's
        properties (``bbox2square`` and ``bbox_scale``).

        Args:
            face_detector (:class:`inaFaceAnalyzer.face_detector.FaceDetector` or None, optional):
                face detection object to be used.
                If None, create a new instance of
                :class:`inaFaceAnalyzer.face_detector.LibFaceDetection`.
                Defaults to None.
            face_classifier (:class:`inaFaceAnalyzer.face_classifier.FaceClassifier` or None, optional):
                face classification object to be used.
                If None, create a new instance of
                :class:`inaFaceAnalyzer.face_classifier.Resnet50FairFaceGRA`.
                Defaults to None.
            batch_len (int, optional): size of the batches of faces sent to
                the classifier. Larger batches allow faster processing but
                require more GPU memory; the value should be set according
                to the available hardware. Must be a strictly positive int.
                Defaults to 32.
            verbose (bool, optional): if True, display several intermediate
                images and results - useful for debugging but should be
                avoided in production. Defaults to False.
        """

        # face detection system
        if face_detector is None:
            face_detector = LibFaceDetection()
        self.face_detector = face_detector

        # face feature extractor working on aligned and detected faces
        if face_classifier is None:
            face_classifier = Resnet50FairFaceGRA()
        self.classifier = face_classifier

        # if set to True, then the bounding box (manual or automatic) is set
        # to the smallest square containing the bounding box
        self.bbox2square = face_classifier.bbox2square

        # scaling factor to be applied to the face bounding box after detection
        # larger bounding box may help for sex classification from face
        self.bbox_scale = face_classifier.bbox_scale

        # face alignment module
        self.face_alignment = Dlib68FaceAlignment()

        # True if some verbose is required
        assert isinstance(verbose, bool)
        self.verbose = verbose

        # set to large values with large memory GPU for faster processing times !
        assert isinstance(batch_len, int) and batch_len > 0
        self.batch_len = batch_len

    @abstractmethod
    def __call__(self, src) :
        """
        Method to be implemented by each analyzer

        Parameters
        ----------
        src : str or list
            path to the video/image to be analyzed
            May also be a list of images

        Returns
        -------
        Results stored in a :class:`pandas.DataFrame`
        """

    def _process_stream(self, stream_iterator, detector):
        """
        Generic pipeline allowing to process image or video streams.
        Faces are first detected, preprocessed and sent in batches to the
        face classifier.

        Parameters
        ----------
        stream_iterator : iterator
            iterator returning (image identifier, decoded RGB image) pairs
            see: opencv_utils.video_iterator, opencv_utils.image_iterator,
            pyav_utils.video_keyframes_iterator
        detector : instance of face_tracking.TrackerDetector or face_detector.FaceDetector
            callable invoked as ``detector(frame, verbose)`` and returning an
            iterable of detections. Detections are named tuples with a
            ``bbox`` field; ``detector.output_type._fields`` lists their
            field names.

        Returns
        -------
        pandas.DataFrame
            one row per detected face: a 'frame' column with the image
            identifier, the detection fields (without 'eyes') and the
            classifier output columns. The same columns are returned, with
            no rows, when no face was found.
        """
        # spatial shape expected by the classifier (last dimension dropped)
        oshape = self.classifier.input_shape[:-1]

        lbatch_img = []   # preprocessed faces waiting to be classified
        linfo = []        # [frame identifier, detection] for each face
        ldf = []          # classifier outputs, one dataframe per batch

        # iterate on image list or video stream
        for iframe, frame in stream_iterator:

            # iterate on detected faces
            for detection in detector(frame, self.verbose):

                # preprocess detected faces: bbox normalization, eye detection, rotation, ...
                face_img, bbox = preprocess_face(frame, detection, self.bbox2square, self.bbox_scale, self.face_alignment, oshape, False)

                # keep the detection together with the bounding box returned
                # by the preprocessing step
                linfo.append([iframe, detection._replace(bbox=tuple(bbox))])
                # store preprocessed faces in a list, for further batch processing
                lbatch_img.append(face_img)

            # if enough faces were found, process a batch of faces
            while len(lbatch_img) > self.batch_len:
                df = self.classifier(lbatch_img[:self.batch_len], verbose=self.verbose)
                ldf.append(df)
                lbatch_img = lbatch_img[self.batch_len:]

        # classify the faces remaining after the end of the stream
        if len(lbatch_img) > 0:
            df = self.classifier(lbatch_img, verbose=self.verbose)
            ldf.append(df)

        if len(ldf) == 0:
            # no face found: return an empty table having the same columns
            # as a non-empty result ('eyes' is never exported)
            det_cols = [f for f in detector.output_type._fields if f != 'eyes']
            return pd.DataFrame(None, columns=(['frame'] + det_cols + list(self.classifier.output_cols)))

        # return results as a pandas Dataframe
        df1 = pd.DataFrame({'frame' : [e[0] for e in linfo]})
        df2 = pd.DataFrame.from_records([e[1] for e in linfo], columns=detector.output_type._fields)
        if 'eyes' in df2.columns:
            df2 = df2.drop('eyes', axis=1)
        df3 = pd.concat(ldf).reset_index(drop=True)
        return pd.concat([df1, df2, df3], axis = 1)


class ImageAnalyzer(FaceAnalyzer):
    """
    ImageAnalyzer instances allow to detect and classify faces from images

    ====  ===================================  ====================  =============  =============  =============  ===========  ===========
      ..  frame                                bbox                    detect_conf    sex_decfunc    age_decfunc  sex_label      age_label
    ====  ===================================  ====================  =============  =============  =============  ===========  ===========
       0  ./media/dknuth.jpg                   (81, 52, 320, 291)         0.999999        7.24841        6.68495  m                61.8495
       1  ./media/800px-India_(236650352).jpg  (193, 194, 494, 495)       1               9.96501        5.76855  m                52.6855
       2  ./media/800px-India_(236650352).jpg  (472, 113, 694, 336)       0.999992       15.1933         4.09797  m                35.9797
       3  ./media/800px-India_(236650352).jpg  (40, 32, 109, 101)         0.999967       11.3448         4.35364  m                38.5364
       4  ./media/800px-India_(236650352).jpg  (384, 54, 458, 127)        0.999964       11.3798         4.36526  m                38.6526
       5  ./media/800px-India_(236650352).jpg  (217, 67, 301, 151)        0.999899        9.78476        4.8296   m                43.296
    ====  ===================================  ====================  =============  =============  =============  ===========  ===========

    """
    def __call__(self, img_paths):
        """
        Detect and classify the faces found in one or several image files.

        Parameters
        ----------
        img_paths : str or list
            path or list of paths to image file(s) to analyze

        Returns
        -------
        pandas Dataframe with column 'frame' containing the path to the source
        image. Remaining columns depend on processing options selected and
        contain bounding box, and face classification information
        """
        # a single path is wrapped in a list so that image_iterator always
        # receives a collection of paths
        paths = [img_paths] if isinstance(img_paths, str) else img_paths
        stream = image_iterator(paths, verbose = self.verbose)
        return self._process_stream(stream, self.face_detector)


class VideoAnalyzer(FaceAnalyzer):
    """
    Video Analyzer allows to detect and classify faces in video streams
    """
    def __call__(self, video_path, fps = None,  offset = 0):
        """
        Pipeline function for face classification from videos (without tracking)

        Parameters:
            video_path: str
                Path to input video.
            fps: float or None (default)
                amount of video frames to process per second
                if set to None, all frames are processed (costly)
            offset: float (default: 0)
                Time in milliseconds to skip at the beginning of the video.
                Negative values are treated as 0.

        Returns:
            Dataframe with frame and face information: frame position,
            coordinates, predictions, decision function, labels...
        """
        # without a target fps every frame is kept (subsampling coefficient 1)
        if fps is None:
            subsamp_coeff = 1
        else:
            subsamp_coeff = analysisFPS2subsamp_coeff(video_path, fps)

        stream = video_iterator(video_path, subsamp_coeff=subsamp_coeff, time_unit='ms', start=max(offset, 0), verbose=self.verbose)
        return self._process_stream(stream, self.face_detector)


class VideoKeyframes(FaceAnalyzer):
    """
    Face detection and analysis from video limited to video key frames
    https://en.wikipedia.org/wiki/Key_frame
    It allows to provide a video analysis summary in fast processing time, but
    with non uniform frame sampling rate
    """
    def __call__(self, video_path):
        """
        Pipeline function for face classification from videos, limited to key frames

        Parameters:
            video_path (string): Path for input video.
        Returns:
            Dataframe with frame and face information: frame position,
            coordinates, predictions, decision function, labels...
        """
        stream = video_keyframes_iterator(video_path, verbose=self.verbose)
        return self._process_stream(stream, self.face_detector)

    def extract_faces(self, df, video_path, output_dir, oshape=None, bbox_scale=None, ext='png'):
        """
        Extract faces found with keyframe analysis to directory output_dir

        Parameters:
            df: pandas DataFrame with 'frame' and 'bbox' columns, one row
                per face (presumably the result of calling this analyzer on
                video_path). Rows are expected to follow keyframe order,
                since keyframes are decoded forward only.
            video_path (string): Path for input video.
            output_dir (string): directory receiving the face images, named
                after the row position in df ('%08d' + extension).
            oshape: output image shape. If None, the classifier input shape
                (without its last dimension) is used.
            bbox_scale: bounding box scaling factor. If None, the analyzer's
                bbox_scale is used.
            ext (string): image file extension. Defaults to 'png'.

        Raises:
            StopIteration: if a frame referenced in df is not found among
                the remaining keyframes of the video.
        """

        if oshape is None:
            oshape = self.classifier.input_shape[:-1]
        if bbox_scale is None:
            bbox_scale = self.bbox_scale

        vki = video_keyframes_iterator(video_path, verbose=self.verbose)
        detector = PrecomputedDetector(list(df.bbox))

        # keyframes are decoded lazily: nothing is read when df is empty
        iframe, frame = None, None

        for ituple, t in enumerate(df.itertuples()):
            # move forward until reaching the keyframe of the current face
            while iframe != t.frame:
                iframe, frame = next(vki)
            # the detector was built with one bounding box per row of df:
            # exactly one detection is expected at each call
            detection = detector(frame)
            assert len(detection) == 1, len(detection)
            detection = detection[0]
            img, _ = preprocess_face(frame, detection, self.bbox2square, bbox_scale, self.face_alignment, oshape, False)
            imwrite_rgb('%s/%08d.%s' % (output_dir, ituple, ext), img)



class VideoTracking(FaceAnalyzer):
    """
    Video processing pipeline including face detection, tracking and classification
    Tracking is usually less costly than face detection (computation bottleneck)
    and allows to save computation time
    Classification decision functions and predictions are averaged for each
    tracked faces, allowing to obtain more robust analysis estimates
    """
    def __init__(self, detection_period, face_detector = None, face_classifier = None, batch_len=32, verbose = False):
        """
        Constructor

        Parameters
        ----------
        detection_period : int
            the face detection algorithm (costly) will be used once every
            'detection_period' analyzed frames.
            Ie: if set to 5, face detection will occur for 1/5 frames and the
            remaining 4/5 faces will be detected through a tracking procedure
            if set to 1: face detection will occur for each frame. Face
            tracking will also be used for each frames, since it will allow
            to group same faces under a person identifier
        face_detector : instance of face_detector.FaceDetector or None, optional
            if None, LibFaceDetection is used. The default is None.
        face_classifier : instance of face_classifier.FaceClassifier or None, optional
            if None, Resnet50FairFaceGRA is used (gender & age). The default is None.
        batch_len : int, optional
            size of the batches of faces sent to the classifier, forwarded
            to the FaceAnalyzer constructor. The default is 32.
        verbose : boolean, optional
            If True, will display several useful intermediate images and results.
            The default is False.
        """
        super().__init__(face_detector, face_classifier, batch_len=batch_len, verbose=verbose)
        self.detection_period = detection_period

    def __call__(self, video_path, fps = None,  offset = 0):
        """
        Pipeline function for face classification from videos with tracking

        Parameters:
            video_path: str
                Path to input video.
            fps: float or None (default)
                amount of video frames to process per second
                if set to None, all frames are processed (costly)
            offset: float (default: 0)
                Time in milliseconds to skip at the beginning of the video.
                Negative values are treated as 0.

        Returns:
            Dataframe with frame and face information: frame position,
            coordinates, predictions, decision function, labels...
            faceid column allow to keep track of each unique face found
            predictions and decision functions with '_avg' suffix are obtained
            through a smoothing procedure of decision functions for all faces
            with same faceid. Smoothed estimates are usually more robust than
            instantaneous ones
        """
        # a new tracker is built for each call, wrapping the face detector
        detector = TrackerDetector(self.face_detector, self.detection_period)

        # without a target fps every frame is kept (subsampling coefficient 1)
        if fps is None:
            subsamp_coeff = 1
        else:
            subsamp_coeff = analysisFPS2subsamp_coeff(video_path, fps)
        stream = video_iterator(video_path, subsamp_coeff=subsamp_coeff, time_unit='ms', start=max(offset, 0), verbose=self.verbose)

        df = self._process_stream(stream, detector)

        # per-face results are post-processed by the classifier
        return self.classifier.average_results(df)



class VideoPrecomputedDetection(FaceAnalyzer):
    """
    Video analysis class to be used combined with pre-detected face bounding
    boxes (uncommon use-case)
    CANNOT BE USED WITH RESCALED OR SQUARIFIED BOUNDING BOXES!!!
    FACE ALIGNMENT REQUIRE TO HAVE ORIGINAL BOUNDING BOX TO PERFORM WELL!!
    """
    def __init__(self, face_classifier = None, verbose = False, bbox_scale=None, bbox2square=None, batch_len=32):
        """
        Constructor

        Parameters
        ----------
        face_classifier: instance of face_classifier.FaceClassifier or None
            if None, Resnet50FairFaceGRA is used by default (gender & age)
        verbose : boolean
            If True, will display several useful intermediate images and results
        bbox_scale: float or None
            scaling factor to be applied to the face bounding box after detection
            if not None, will override classifier's bbox scaling instructions
            useful if provided bounding boxes are already transformed - default None
        bbox2square: boolean or None
            set bounding box to the smallest square containing the bounding box
            if not None, will override classifier's bbox2square instructions
            useful if provided bounding boxes are already transformed - default None
        batch_len: int
            size of the batches of faces sent to the classifier, forwarded
            to the FaceAnalyzer constructor - default 32
        """

        super().__init__(PrecomputedDetector(), face_classifier, batch_len=batch_len, verbose=verbose)
        # explicit values take precedence over the classifier's settings
        if bbox_scale is not None:
            self.bbox_scale = bbox_scale
        if bbox2square is not None:
            self.bbox2square = bbox2square

    def __call__(self, video_path, lbbox, fps=None, start_frame = 0):
        """
        Pipeline function for face classification from videos using pre-detected faces

        Parameters:
            video_path: str
                Path to input video.
            lbbox: list of bounding boxes
                Each list element i contain either a tuple or a list of tuples
                (x1,y1,x2,y2) corresponding to the face found in the ith frame
            fps: float or None (default)
                amount of video frames to process per second
                if set to None, all frames are processed (costly)
            start_frame: int (default: 0)
                Start position given to the video iterator, expressed in
                frames (time_unit='frame').

        Returns:
            Dataframe with frame and face information: frame position,
            coordinates, predictions, decision function, labels...

        Raises:
            AssertionError: if bounding boxes remain in the detector once
                the whole stream has been processed.
        """
        # a dedicated detector serving the provided bounding boxes is used
        # instead of self.face_detector
        detector = PrecomputedDetector(lbbox)

        # without a target fps every frame is kept (subsampling coefficient 1)
        if fps is None:
            subsamp_coeff = 1
        else:
            subsamp_coeff = analysisFPS2subsamp_coeff(video_path, fps)
        stream = video_iterator(video_path, subsamp_coeff=subsamp_coeff, time_unit='frame', start=start_frame, verbose=self.verbose)
        df = self._process_stream(stream, detector)

        # every provided bounding box should have been consumed
        assert len(detector.lbbox) == 0, 'the detection list is longer than the number of processed frames'
        return df