"""Validators for annotation files and datasets."""
import json
from collections.abc import Callable
from functools import wraps
from pathlib import Path
import pandas as pd
import pandera.pandas as pa
import xarray as xr
from attrs import define, field
from pandera.typing import Index
from ethology.io.annotations.json_schemas.utils import (
_check_file_is_json,
_check_file_matches_schema,
_check_required_keys_in_dict,
_get_default_schema,
)
@define
class ValidVIA:
"""Class for valid VIA JSON annotation files.
It checks the input file is a valid JSON file, matches
the VIA schema and contains the required keys.
Attributes
----------
path : Path | str
Path to the VIA JSON file, passed as an input.
schema : dict
The JSON schema is set to the default VIA schema.
required_keys : dict
The required keys for the VIA JSON file.
Raises
------
ValueError
If the JSON file cannot be decoded.
jsonschema.exceptions.ValidationError
If the type of any of the keys in the JSON file
does not match the type specified in the schema.
jsonschema.exceptions.SchemaError
If the schema is invalid.
ValueError
If the VIA JSON file is missing any of the required keys.
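
    Examples
    --------
    A minimal sketch; the import path and the file path are hypothetical:

    >>> from ethology.io.annotations.validators import ValidVIA
    >>> ValidVIA(path="path/to/via_annotations.json")  # doctest: +SKIP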
"""
path: Path = field(converter=Path)
schema: dict = field(
default=_get_default_schema("VIA"),
init=False,
)
required_keys: dict = field(
default={
"main": ["_via_img_metadata", "_via_attributes"],
"images": ["filename"],
"regions": ["shape_attributes"],
"shape_attributes": ["x", "y", "width", "height"],
},
init=False,
# with init=False the attribute is always initialized
# with the default value
)
# Note: the validators are applied in order
@path.validator
def _file_is_json(self, attribute, value):
_check_file_is_json(value)
@path.validator
def _file_matches_JSON_schema(self, attribute, value):
_check_file_matches_schema(value, self.schema)
@path.validator
def _file_contains_required_keys(self, attribute, value):
"""Ensure that the VIA JSON file contains the required keys."""
# Read data as dict
with open(value) as file:
data = json.load(file)
# Check first level keys
_check_required_keys_in_dict(self.required_keys["main"], data)
# Check keys in nested dicts
for img_str, img_dict in data["_via_img_metadata"].items():
# Check keys for each image dictionary
_check_required_keys_in_dict(
self.required_keys["images"],
img_dict,
additional_message=f" for {img_str}",
)
# Check keys for each region in an image
for i, region in enumerate(img_dict["regions"]):
# Check keys under first level per region
_check_required_keys_in_dict(
self.required_keys["regions"],
region,
additional_message=f" for region {i} under {img_str}",
)
# Check keys under "shape_attributes" per region
_check_required_keys_in_dict(
self.required_keys["shape_attributes"],
region["shape_attributes"],
additional_message=f" for region {i} under {img_str}",
)
@define
class ValidCOCO:
"""Class for valid COCO JSON annotation files.
It checks the input file is a valid JSON file, matches
the COCO schema and contains the required keys.
Attributes
----------
path : Path | str
Path to the COCO JSON file, passed as an input.
schema : dict
The JSON schema is set to the default COCO schema.
required_keys : dict
The required keys for the COCO JSON file.
Raises
------
ValueError
If the JSON file cannot be decoded.
jsonschema.exceptions.ValidationError
If the type of any of the keys in the JSON file
does not match the type specified in the schema.
jsonschema.exceptions.SchemaError
If the schema is invalid.
ValueError
If the COCO JSON file is missing any of the required keys.
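
    Examples
    --------
    A minimal sketch; the import path and the file path are hypothetical:

    >>> from ethology.io.annotations.validators import ValidCOCO
    >>> ValidCOCO(path="path/to/coco_annotations.json")  # doctest: +SKIP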
"""
path: Path = field(converter=Path)
schema: dict = field(
default=_get_default_schema("COCO"),
init=False,
# with init=False the attribute is always initialized
# with the default value
)
# The keys of "required_keys" match the 1st level keys in a COCO JSON file
required_keys: dict = field(
default={
"main": ["images", "annotations", "categories"],
"images": ["id", "file_name", "width", "height"],
"annotations": ["id", "image_id", "bbox", "category_id"],
"categories": ["id", "name"], # exclude "supercategory"
},
init=False,
)
# Note: the validators are applied in order
@path.validator
def _file_is_json(self, attribute, value):
_check_file_is_json(value)
@path.validator
def _file_matches_JSON_schema(self, attribute, value):
_check_file_matches_schema(value, self.schema)
@path.validator
def _file_contains_required_keys(self, attribute, value):
"""Ensure that the COCO JSON file contains the required keys."""
# Helper function to singularise the input key for the
# error message
def _singularise_err_msg(key):
return key[:-1] if key != "categories" else key[:-3] + "y"
# Read file as dict
with open(value) as file:
data = json.load(file)
# Check first level keys
_check_required_keys_in_dict(self.required_keys["main"], data)
# Check keys in every dict listed under the "images", "annotations"
# and "categories" keys
for ky in list(self.required_keys.keys())[1:]:
for instance_dict in data[ky]:
_check_required_keys_in_dict(
self.required_keys[ky],
instance_dict,
additional_message=(
f" for {_singularise_err_msg(ky)} {instance_dict}"
),
)
@path.validator
def _file_contains_unique_image_IDs(self, attribute, value):
"""Ensure that the COCO JSON file contains unique image IDs.
When exporting to COCO format, the VIA tool attempts to extract the
image ID from the image filename using ``parseInt``. As a result, if
two or more images have the same number-based filename, the image IDs
can be non-unique (i.e., more image filenames than image IDs). This is
probably a bug in the VIA tool, but we need to check for this issue.
"""
with open(value) as file:
data = json.load(file)
# Get number of elements in "images" list
n_images = len(data["images"])
        # Get the unique image IDs
        unique_image_ids = {img["id"] for img in data["images"]}
# Check for duplicate image IDs
if n_images != len(unique_image_ids):
raise ValueError(
"The image IDs in the input COCO file are not unique. "
f"There are {n_images} image entries, but only "
f"{len(unique_image_ids)} unique image IDs."
)
@define
class ValidBboxesDataset:
"""Class for valid ``ethology`` bounding box annotations datasets.
It checks that the input dataset has:
- ``image_id``, ``space``, ``id`` as dimensions
- ``position`` and ``shape`` as data variables
Attributes
----------
dataset : xarray.Dataset
The xarray dataset to validate.
Raises
------
TypeError
If the input is not an xarray Dataset.
ValueError
If the dataset is missing required data variables or dimensions.
Notes
-----
The dataset can have other data variables and dimensions, but only the
required ones are checked.
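
    Examples
    --------
    A minimal sketch of a dataset that passes validation; the array values
    are placeholders:

    >>> import numpy as np
    >>> import xarray as xr
    >>> ds = xr.Dataset(
    ...     {
    ...         "position": (("image_id", "space", "id"), np.zeros((1, 2, 1))),
    ...         "shape": (("image_id", "space", "id"), np.ones((1, 2, 1))),
    ...     }
    ... )
    >>> ValidBboxesDataset(dataset=ds)  # doctest: +SKIP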
"""
dataset: xr.Dataset = field()
# Minimum requirements for annotations datasets holding bboxes
required_dims: set = field(
default={"image_id", "space", "id"},
init=False,
)
required_data_vars: set = field(
default={"position", "shape"},
init=False,
)
@dataset.validator
def _check_dataset_type(self, attribute, value):
"""Ensure the input is an xarray Dataset."""
if not isinstance(value, xr.Dataset):
raise TypeError(
f"Expected an xarray Dataset, but got {type(value)}."
)
@dataset.validator
def _check_required_data_variables(self, attribute, value):
"""Ensure the dataset has all required data variables."""
missing_vars = self.required_data_vars - set(value.data_vars)
if missing_vars:
raise ValueError(
f"Missing required data variables: {sorted(missing_vars)}"
)
@dataset.validator
def _check_required_dimensions(self, attribute, value):
"""Ensure the dataset has all required dimensions."""
missing_dims = self.required_dims - set(value.dims)
if missing_dims:
raise ValueError(
f"Missing required dimensions: {sorted(missing_dims)}"
)
class ValidBboxesDataFrame(pa.DataFrameModel):
"""Class for valid bounding boxes intermediate dataframes.
We use this dataframe internally as an intermediate step in the process of
converting an input bounding box annotations file (VIA or COCO) to
an ``ethology`` dataset. The validation checks all required columns
exist and their types are correct.
Attributes
----------
image_filename : str
Name of the image file.
image_id : int
Unique identifier for each of the images.
image_width : int
Width of each of the images, in the same units as the input file
(usually pixels).
image_height : int
Height of each of the images, in the same units as the input file
(usually pixels).
x_min : float
Minimum x-coordinate of the bounding box, in the same units as
the input file.
y_min : float
Minimum y-coordinate of the bounding box, in the same units as
the input file.
width : float
Width of the bounding box, in the same units as the input file.
height : float
Height of the bounding box, in the same units as the input file.
category_id : int
Unique identifier for the category, as specified in the input file.
A value of 0 is usually reserved for the background class.
category : str
Category of the annotation as a string.
supercategory : str
Supercategory of the annotation as a string.
Raises
------
pa.errors.SchemaError
If the input dataframe does not match the schema.
See Also
--------
:class:`pandera.api.pandas.model.DataFrameModel`
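
    Examples
    --------
    A minimal sketch validating a one-row dataframe; the values are
    placeholders:

    >>> import pandas as pd
    >>> df = pd.DataFrame(
    ...     {
    ...         "image_filename": ["frame_00.png"],
    ...         "image_id": [0],
    ...         "image_width": [640],
    ...         "image_height": [480],
    ...         "x_min": [10.0],
    ...         "y_min": [20.0],
    ...         "width": [50.0],
    ...         "height": [40.0],
    ...         "category_id": [1],
    ...         "category": ["mouse"],
    ...         "supercategory": ["animal"],
    ...     }
    ... )
    >>> ValidBboxesDataFrame.validate(df)  # doctest: +SKIP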
"""
# image columns
image_filename: str = pa.Field(description="Name of the image file.")
image_id: int = pa.Field(
description="Unique identifier for each of the images."
)
image_width: int = pa.Field(
description="Width of each of the images, "
"in the same units as the input file (usually pixels)."
# if not defined, it should be set to 0 in the df
)
image_height: int = pa.Field(
description="Height of each of the images, "
"in the same units as the input file (usually pixels)."
# if not defined, it should be set to 0 in the df
)
# bbox columns
x_min: float = pa.Field(
description=(
"Minimum x-coordinate of the bounding box, "
"in the same units as the input file."
)
)
y_min: float = pa.Field(
description=(
"Minimum y-coordinate of the bounding box, "
"in the same units as the input file."
)
)
width: float = pa.Field(
description=(
"Width of the bounding box, in the same units as the input file."
)
)
height: float = pa.Field(
description=(
"Height of the bounding box, in the same units as the input file."
)
)
# category columns
# - always defined in COCO files exported with VIA tool
# - optionally defined in VIA files exported with VIA tool
category_id: int = pa.Field(
description=(
"Unique identifier for the category, "
"as specified in the input file. A value of 0 "
"is usually reserved for the background class."
)
)
category: str = pa.Field(
description="Category of the annotation as a string."
)
supercategory: str = pa.Field(
description="Supercategory of the annotation as a string."
)
@staticmethod
def get_empty_values() -> dict:
"""Get the default empty values for selected dataframe columns.
The columns are those that can be undefined in VIA and COCO files:
``category``, ``supercategory``, ``category_id``, ``image_width`` and
``image_height``.
Returns
-------
dict
A dictionary with the default empty values the specified columns.
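
        Examples
        --------
        >>> ValidBboxesDataFrame.get_empty_values()["category_id"]
        -1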
"""
return {
"category": "", # it can be undefined in VIA files
"supercategory": "", # it can be undefined in VIA and COCO files
"category_id": -1, # it can be undefined in VIA files
"image_width": 0, # it can be undefined in VIA files
"image_height": 0, # it can be undefined in VIA files
}
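
# For illustration, a hedged sketch of filling columns that are undefined in
# the input file with the defaults above, before validation; ``df`` is a
# hypothetical intermediate dataframe:
#
#     for col, empty_value in ValidBboxesDataFrame.get_empty_values().items():
#         if col not in df.columns:
#             df[col] = empty_value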
class ValidBboxesDataFrameCOCO(pa.DataFrameModel):
"""Class for COCO-exportable bounding box annotations dataframes.
The validation checks the required columns exist and their types are
correct. It additionally checks that the index and the
``annotation_id`` column are equal.
Attributes
----------
idx : Index[int]
Index of the dataframe. Should be greater than or equal to 0 and equal
to the ``annotation_id`` column.
annotation_id : int
Unique identifier for the annotation. Should be equal to the index.
image_id : int
Unique identifier for each of the images.
image_filename : str
Filename of the image.
image_width : int
Width of each of the images.
image_height : int
Height of each of the images.
bbox : list[float]
Bounding box coordinates as xmin, ymin, width, height.
area : float
Bounding box area.
segmentation : list[list[float]]
Bounding box segmentation masks as list of lists of coordinates.
category : str
Category of the annotation.
supercategory : str
Supercategory of the annotation.
iscrowd : int
Whether the annotation is a crowd. Should be 0 or 1.
Raises
------
pa.errors.SchemaError
If the dataframe does not match the schema.
Notes
-----
See `COCO format documentation <https://cocodataset.org/#format-data>`_
for more details.
See Also
--------
:class:`pandera.api.pandas.model.DataFrameModel`
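
    Examples
    --------
    A minimal sketch of a one-row COCO-exportable dataframe; the values are
    placeholders:

    >>> import pandas as pd
    >>> df = pd.DataFrame(
    ...     {
    ...         "annotation_id": [0],
    ...         "image_id": [0],
    ...         "image_filename": ["frame_00.png"],
    ...         "image_width": [640],
    ...         "image_height": [480],
    ...         "bbox": [[10.0, 20.0, 50.0, 40.0]],
    ...         "area": [2000.0],
    ...         "segmentation": [[[10.0, 20.0, 60.0, 20.0, 60.0, 60.0]]],
    ...         "category": ["mouse"],
    ...         "iscrowd": [0],
    ...     }
    ... )
    >>> ValidBboxesDataFrameCOCO.validate(df)  # doctest: +SKIP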
"""
# index
idx: Index[int] = pa.Field(ge=0, check_name=False)
# annotation_id
annotation_id: int = pa.Field(
description="Unique identifier for the annotation (index)",
)
# image columns
image_id: int = pa.Field(
description="Unique identifier for the image",
)
image_filename: str = pa.Field(
description="Filename of the image",
)
image_width: int = pa.Field(
description="Width of the image", ge=0, nullable=True
)
image_height: int = pa.Field(
description="Height of the image", ge=0, nullable=True
)
# bbox data
bbox: list[float] = pa.Field(
description="Bounding box coordinates as xmin, ymin, width, height"
)
area: float = pa.Field(
description="Bounding box area",
ge=0,
)
segmentation: list[list[float]] = pa.Field(
description="Bounding box segmentation as list of lists of coordinates"
)
# category columns
# we do not require supercategories to be present in the
# dataframe since they are not currently added to the xarray dataset
category: str = pa.Field(
description="Category of the annotation",
nullable=True,
)
# other
iscrowd: int = pa.Field(
description="Whether the annotation is a crowd",
isin=[0, 1],
nullable=True,
)
@staticmethod
def map_df_columns_to_COCO_fields() -> dict:
"""Map COCO-exportable dataframe columns to COCO fields.
Returns
-------
dict
A dictionary mapping each column in the COCO-exportable dataframe
to the corresponding fields in the equivalent COCO file.
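
        Examples
        --------
        >>> col_map = ValidBboxesDataFrameCOCO.map_df_columns_to_COCO_fields()
        >>> col_map["images"]["image_filename"]
        'file_name'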
"""
return {
"images": {
"image_id": "id",
"image_filename": "file_name",
"image_width": "width",
"image_height": "height",
},
"categories": {
"category_id": "id",
"category": "name",
"supercategory": "supercategory",
},
"annotations": {
"annotation_id": "id",
"area": "area",
"bbox": "bbox",
"image_id": "image_id",
"category_id": "category_id",
"iscrowd": "iscrowd",
"segmentation": "segmentation",
},
}
@pa.dataframe_check
def check_idx_and_annotation_id(cls, df: pd.DataFrame) -> bool:
"""Check that the index and the ``annotation_id`` column are equal.
Parameters
----------
df : pd.DataFrame
The dataframe to check.
Returns
-------
bool
A boolean indicating whether the index and the
``annotation_id`` column are equal for all rows.
"""
return all(df.index == df["annotation_id"])
def _check_output(validator: type):
"""Return a decorator that validates the output of a function."""
def decorator(function: Callable) -> Callable:
@wraps(function) # to preserve function metadata
        def wrapper(*args, **kwargs):
            result = function(*args, **kwargs)
            # Validate the output before returning it; the validator
            # raises if the output is invalid
            validator(result)
            return result
return wrapper
return decorator
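
# For illustration, a hedged sketch of how ``_check_output`` composes with
# the validators above; the decorated function is hypothetical:
#
#     @_check_output(ValidBboxesDataset)
#     def _df_to_dataset(df):
#         ...  # build and return an xr.Dataset from the dataframe
#
# The returned dataset is passed through ``ValidBboxesDataset`` before being
# handed back to the caller.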
def _check_input(validator: type, input_index: int = 0):
"""Return a decorator that validates a specific input of a function.
By default, the first input is validated. If the input index is
larger than the number of inputs, no validation is performed.
"""
def decorator(function: Callable) -> Callable:
@wraps(function)
        def wrapper(*args, **kwargs):
            # Validate only if the positional argument exists;
            # keyword arguments are not inspected
            if len(args) > input_index:
                validator(args[input_index])
            return function(*args, **kwargs)
return wrapper
return decorator
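
# For illustration, a hedged sketch of validating the first input of a
# hypothetical loader with ``_check_input``; the function name is a
# placeholder:
#
#     @_check_input(ValidCOCO)
#     def _load_coco(path):
#         with open(path) as file:
#             return json.load(file)
#
# Here ``ValidCOCO(path)`` runs before the function body, so an invalid file
# raises before any parsing is attempted.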