"""Validators for annotation files and datasets."""
import json
from collections.abc import Callable
from functools import wraps
from pathlib import Path
import pandas as pd
import pandera.pandas as pa
import xarray as xr
from attrs import define, field
from pandera.typing import Index
from ethology.io.annotations.json_schemas.utils import (
_check_file_is_json,
_check_file_matches_schema,
_check_required_keys_in_dict,
_get_default_schema,
)
@define
class ValidVIA:
"""Class for valid VIA JSON annotation files.
It checks the input file is a valid JSON file, matches
the VIA schema and contains the required keys.
Attributes
----------
path : Path | str
Path to the VIA JSON file, passed as an input.
schema : dict
The JSON schema is set to the default VIA schema.
required_keys : dict
The required keys for the VIA JSON file.
Raises
------
ValueError
If the JSON file cannot be decoded.
jsonschema.exceptions.ValidationError
If the type of any of the keys in the JSON file
does not match the type specified in the schema.
jsonschema.exceptions.SchemaError
If the schema is invalid.
ValueError
If the VIA JSON file is missing any of the required keys.
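
    Examples
    --------
    A minimal sketch; the import path and the file path are hypothetical:

    >>> from ethology.io.annotations.validators import ValidVIA
    >>> ValidVIA(path="path/to/via_annotations.json")  # doctest: +SKIP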
"""
path: Path = field(converter=Path)
schema: dict = field(
default=_get_default_schema("VIA"),
init=False,
)
required_keys: dict = field(
default={
"main": ["_via_img_metadata", "_via_attributes"],
"images": ["filename"],
"regions": ["shape_attributes"],
"shape_attributes": ["x", "y", "width", "height"],
},
init=False,
# with init=False the attribute is always initialized
# with the default value
)
# Note: the validators are applied in order
@path.validator
def _file_is_json(self, attribute, value):
_check_file_is_json(value)
@path.validator
def _file_matches_JSON_schema(self, attribute, value):
_check_file_matches_schema(value, self.schema)
@path.validator
def _file_contains_required_keys(self, attribute, value):
"""Ensure that the VIA JSON file contains the required keys."""
# Read data as dict
with open(value) as file:
data = json.load(file)
# Check first level keys
_check_required_keys_in_dict(self.required_keys["main"], data)
# Check keys in nested dicts
for img_str, img_dict in data["_via_img_metadata"].items():
# Check keys for each image dictionary
_check_required_keys_in_dict(
self.required_keys["images"],
img_dict,
additional_message=f" for {img_str}",
)
# Check keys for each region in an image
for i, region in enumerate(img_dict["regions"]):
# Check keys under first level per region
_check_required_keys_in_dict(
self.required_keys["regions"],
region,
additional_message=f" for region {i} under {img_str}",
)
# Check keys under "shape_attributes" per region
_check_required_keys_in_dict(
self.required_keys["shape_attributes"],
region["shape_attributes"],
additional_message=f" for region {i} under {img_str}",
)
@define
class ValidCOCO:
"""Class for valid COCO JSON annotation files.
It checks the input file is a valid JSON file, matches
the COCO schema and contains the required keys.
Attributes
----------
path : Path | str
Path to the COCO JSON file, passed as an input.
schema : dict
The JSON schema is set to the default COCO schema.
required_keys : dict
The required keys for the COCO JSON file.
Raises
------
ValueError
If the JSON file cannot be decoded.
jsonschema.exceptions.ValidationError
If the type of any of the keys in the JSON file
does not match the type specified in the schema.
jsonschema.exceptions.SchemaError
If the schema is invalid.
ValueError
If the COCO JSON file is missing any of the required keys.
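
    Examples
    --------
    A minimal sketch; the import path and the file path are hypothetical:

    >>> from ethology.io.annotations.validators import ValidCOCO
    >>> ValidCOCO(path="path/to/coco_annotations.json")  # doctest: +SKIP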
"""
path: Path = field(converter=Path)
schema: dict = field(
default=_get_default_schema("COCO"),
init=False,
# with init=False the attribute is always initialized
# with the default value
)
# The keys of "required_keys" match the 1st level keys in a COCO JSON file
required_keys: dict = field(
default={
"main": ["images", "annotations", "categories"],
"images": ["id", "file_name", "width", "height"],
"annotations": ["id", "image_id", "bbox", "category_id"],
"categories": ["id", "name"], # exclude "supercategory"
},
init=False,
)
# Note: the validators are applied in order
@path.validator
def _file_is_json(self, attribute, value):
_check_file_is_json(value)
@path.validator
def _file_matches_JSON_schema(self, attribute, value):
_check_file_matches_schema(value, self.schema)
@path.validator
def _file_contains_required_keys(self, attribute, value):
"""Ensure that the COCO JSON file contains the required keys."""
# Helper function to singularise the input key for the
# error message
def _singularise_err_msg(key):
return key[:-1] if key != "categories" else key[:-3] + "y"
# Read file as dict
with open(value) as file:
data = json.load(file)
# Check first level keys
_check_required_keys_in_dict(self.required_keys["main"], data)
# Check keys in every dict listed under the "images", "annotations"
# and "categories" keys
for ky in list(self.required_keys.keys())[1:]:
for instance_dict in data[ky]:
_check_required_keys_in_dict(
self.required_keys[ky],
instance_dict,
additional_message=(
f" for {_singularise_err_msg(ky)} {instance_dict}"
),
)
@path.validator
def _file_contains_unique_image_IDs(self, attribute, value):
"""Ensure that the COCO JSON file contains unique image IDs.
When exporting to COCO format, the VIA tool attempts to extract the
image ID from the image filename using ``parseInt``. As a result, if
two or more images have the same number-based filename, the image IDs
can be non-unique (i.e., more image filenames than image IDs). This is
probably a bug in the VIA tool, but we need to check for this issue.
"""
with open(value) as file:
data = json.load(file)
# Get number of elements in "images" list
n_images = len(data["images"])
        # Get the unique image IDs
        unique_image_ids = {img["id"] for img in data["images"]}
# Check for duplicate image IDs
if n_images != len(unique_image_ids):
raise ValueError(
"The image IDs in the input COCO file are not unique. "
f"There are {n_images} image entries, but only "
f"{len(unique_image_ids)} unique image IDs."
)
@define
class ValidBboxesDataset:
"""Class for valid ``ethology`` bounding box annotations datasets.
It checks that the input dataset has:
- ``image_id``, ``space``, ``id`` as dimensions
- ``position`` and ``shape`` as data variables
Attributes
----------
dataset : xarray.Dataset
The xarray dataset to validate.
Raises
------
TypeError
If the input is not an xarray Dataset.
ValueError
If the dataset is missing required data variables or dimensions.
Notes
-----
The dataset can have other data variables and dimensions, but only the
required ones are checked.
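
    Examples
    --------
    A minimal sketch of a dataset that passes validation; the array values
    are placeholders:

    >>> import numpy as np
    >>> import xarray as xr
    >>> ds = xr.Dataset(
    ...     {
    ...         "position": (("image_id", "space", "id"), np.zeros((1, 2, 1))),
    ...         "shape": (("image_id", "space", "id"), np.ones((1, 2, 1))),
    ...     }
    ... )
    >>> ValidBboxesDataset(dataset=ds)  # doctest: +SKIP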
"""
dataset: xr.Dataset = field()
# Minimum requirements for annotations datasets holding bboxes
required_dims: set = field(
default={"image_id", "space", "id"},
init=False,
)
required_data_vars: set = field(
default={"position", "shape"},
init=False,
)
@dataset.validator
def _check_dataset_type(self, attribute, value):
"""Ensure the input is an xarray Dataset."""
if not isinstance(value, xr.Dataset):
raise TypeError(
f"Expected an xarray Dataset, but got {type(value)}."
)
@dataset.validator
def _check_required_data_variables(self, attribute, value):
"""Ensure the dataset has all required data variables."""
missing_vars = self.required_data_vars - set(value.data_vars)
if missing_vars:
raise ValueError(
f"Missing required data variables: {sorted(missing_vars)}"
)
@dataset.validator
def _check_required_dimensions(self, attribute, value):
"""Ensure the dataset has all required dimensions."""
missing_dims = self.required_dims - set(value.dims)
if missing_dims:
raise ValueError(
f"Missing required dimensions: {sorted(missing_dims)}"
)
class ValidBboxesDataFrame(pa.DataFrameModel):
"""Class for valid bounding boxes intermediate dataframes.
We use this dataframe internally as an intermediate step in the process of
converting an input bounding box annotations file (VIA or COCO) to
an ``ethology`` dataset. The validation checks all required columns
exist and their types are correct.
Attributes
----------
image_filename : str
Name of the image file.
image_id : int
Unique identifier for each of the images.
image_width : int
Width of each of the images, in the same units as the input file
(usually pixels).
image_height : int
Height of each of the images, in the same units as the input file
(usually pixels).
x_min : float
Minimum x-coordinate of the bounding box, in the same units as
the input file.
y_min : float
Minimum y-coordinate of the bounding box, in the same units as
the input file.
width : float
Width of the bounding box, in the same units as the input file.
height : float
Height of the bounding box, in the same units as the input file.
category_id : int
Unique identifier for the category, as specified in the input file.
A value of 0 is usually reserved for the background class.
category : str
Category of the annotation as a string.
supercategory : str
Supercategory of the annotation as a string.
Raises
------
pa.errors.SchemaError
If the input dataframe does not match the schema.
See Also
--------
:class:`pandera.api.pandas.model.DataFrameModel`
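
    Examples
    --------
    A minimal sketch validating a one-row dataframe; the values are
    placeholders:

    >>> import pandas as pd
    >>> df = pd.DataFrame(
    ...     {
    ...         "image_filename": ["frame_00.png"],
    ...         "image_id": [0],
    ...         "image_width": [640],
    ...         "image_height": [480],
    ...         "x_min": [10.0],
    ...         "y_min": [20.0],
    ...         "width": [50.0],
    ...         "height": [40.0],
    ...         "category_id": [1],
    ...         "category": ["mouse"],
    ...         "supercategory": ["animal"],
    ...     }
    ... )
    >>> ValidBboxesDataFrame.validate(df)  # doctest: +SKIP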
"""
# image columns
image_filename: str = pa.Field(description="Name of the image file.")
image_id: int = pa.Field(
description="Unique identifier for each of the images."
)
image_width: int = pa.Field(
description="Width of each of the images, "
"in the same units as the input file (usually pixels)."
# if not defined, it should be set to 0 in the df
)
image_height: int = pa.Field(
description="Height of each of the images, "
"in the same units as the input file (usually pixels)."
# if not defined, it should be set to 0 in the df
)
# bbox columns
x_min: float = pa.Field(
description=(
"Minimum x-coordinate of the bounding box, "
"in the same units as the input file."
)
)
y_min: float = pa.Field(
description=(
"Minimum y-coordinate of the bounding box, "
"in the same units as the input file."
)
)
width: float = pa.Field(
description=(
"Width of the bounding box, in the same units as the input file."
)
)
height: float = pa.Field(
description=(
"Height of the bounding box, in the same units as the input file."
)
)
# category columns
# - always defined in COCO files exported with VIA tool
# - optionally defined in VIA files exported with VIA tool
category_id: int = pa.Field(
description=(
"Unique identifier for the category, "
"as specified in the input file. A value of 0 "
"is usually reserved for the background class."
)
)
category: str = pa.Field(
description="Category of the annotation as a string."
)
supercategory: str = pa.Field(
description="Supercategory of the annotation as a string."
)
@staticmethod
def get_empty_values() -> dict:
"""Get the default empty values for selected dataframe columns.
The columns are those that can be undefined in VIA and COCO files:
``category``, ``supercategory``, ``category_id``, ``image_width`` and
``image_height``.
Returns
-------
dict
A dictionary with the default empty values the specified columns.
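
        Examples
        --------
        >>> ValidBboxesDataFrame.get_empty_values()["category_id"]
        -1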
"""
return {
"category": "", # it can be undefined in VIA files
"supercategory": "", # it can be undefined in VIA and COCO files
"category_id": -1, # it can be undefined in VIA files
"image_width": 0, # it can be undefined in VIA files
"image_height": 0, # it can be undefined in VIA files
}
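
# For illustration, a hedged sketch of filling columns that are undefined in
# the input file with the defaults above, before validation; ``df`` is a
# hypothetical intermediate dataframe:
#
#     for col, empty_value in ValidBboxesDataFrame.get_empty_values().items():
#         if col not in df.columns:
#             df[col] = empty_value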
class ValidBboxesDataFrameCOCO(pa.DataFrameModel):
"""Class for COCO-exportable bounding box annotations dataframes.
The validation checks the required columns exist and their types are
correct. It additionally checks that the index and the
``annotation_id`` column are equal.
Attributes
----------
idx : Index[int]
Index of the dataframe. Should be greater than or equal to 0 and equal
to the ``annotation_id`` column.
annotation_id : int
Unique identifier for the annotation. Should be equal to the index.
image_id : int
Unique identifier for each of the images.
image_filename : str
Filename of the image.
image_width : int
Width of each of the images.
image_height : int
Height of each of the images.
bbox : list[float]
Bounding box coordinates as xmin, ymin, width, height.
area : float
Bounding box area.
segmentation : list[list[float]]
Bounding box segmentation masks as list of lists of coordinates.
category : str
Category of the annotation.
supercategory : str
Supercategory of the annotation.
iscrowd : int
Whether the annotation is a crowd. Should be 0 or 1.
Raises
------
pa.errors.SchemaError
If the dataframe does not match the schema.
Notes
-----
See `COCO format documentation <https://cocodataset.org/#format-data>`_
for more details.
See Also
--------
:class:`pandera.api.pandas.model.DataFrameModel`
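
    Examples
    --------
    A minimal sketch of a one-row COCO-exportable dataframe; the values are
    placeholders:

    >>> import pandas as pd
    >>> df = pd.DataFrame(
    ...     {
    ...         "annotation_id": [0],
    ...         "image_id": [0],
    ...         "image_filename": ["frame_00.png"],
    ...         "image_width": [640],
    ...         "image_height": [480],
    ...         "bbox": [[10.0, 20.0, 50.0, 40.0]],
    ...         "area": [2000.0],
    ...         "segmentation": [[[10.0, 20.0, 60.0, 20.0, 60.0, 60.0]]],
    ...         "category": ["mouse"],
    ...         "iscrowd": [0],
    ...     }
    ... )
    >>> ValidBboxesDataFrameCOCO.validate(df)  # doctest: +SKIP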
"""
# index
idx: Index[int] = pa.Field(ge=0, check_name=False)
# annotation_id
annotation_id: int = pa.Field(
description="Unique identifier for the annotation (index)",
)
# image columns
image_id: int = pa.Field(
description="Unique identifier for the image",
)
image_filename: str = pa.Field(
description="Filename of the image",
)
image_width: int = pa.Field(
description="Width of the image", ge=0, nullable=True
)
image_height: int = pa.Field(
description="Height of the image", ge=0, nullable=True
)
# bbox data
bbox: list[float] = pa.Field(
description="Bounding box coordinates as xmin, ymin, width, height"
)
area: float = pa.Field(
description="Bounding box area",
ge=0,
)
segmentation: list[list[float]] = pa.Field(
description="Bounding box segmentation as list of lists of coordinates"
)
# category columns
# we do not require supercategories to be present in the
# dataframe since they are not currently added to the xarray dataset
category: str = pa.Field(
description="Category of the annotation",
nullable=True,
)
# other
iscrowd: int = pa.Field(
description="Whether the annotation is a crowd",
isin=[0, 1],
nullable=True,
)
@staticmethod
def map_df_columns_to_COCO_fields() -> dict:
"""Map COCO-exportable dataframe columns to COCO fields.
Returns
-------
dict
A dictionary mapping each column in the COCO-exportable dataframe
to the corresponding fields in the equivalent COCO file.
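
        Examples
        --------
        >>> col_map = ValidBboxesDataFrameCOCO.map_df_columns_to_COCO_fields()
        >>> col_map["images"]["image_filename"]
        'file_name'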
"""
return {
"images": {
"image_id": "id",
"image_filename": "file_name",
"image_width": "width",
"image_height": "height",
},
"categories": {
"category_id": "id",
"category": "name",
"supercategory": "supercategory",
},
"annotations": {
"annotation_id": "id",
"area": "area",
"bbox": "bbox",
"image_id": "image_id",
"category_id": "category_id",
"iscrowd": "iscrowd",
"segmentation": "segmentation",
},
}
@pa.dataframe_check
def check_idx_and_annotation_id(cls, df: pd.DataFrame) -> bool:
"""Check that the index and the ``annotation_id`` column are equal.
Parameters
----------
df : pd.DataFrame
The dataframe to check.
Returns
-------
bool
A boolean indicating whether the index and the
``annotation_id`` column are equal for all rows.
"""
return all(df.index == df["annotation_id"])
def _check_output(validator: type):
"""Return a decorator that validates the output of a function."""
def decorator(function: Callable) -> Callable:
@wraps(function) # to preserve function metadata
        def wrapper(*args, **kwargs):
            result = function(*args, **kwargs)
            # Validate the output before returning it; the validator
            # raises if the output is invalid
            validator(result)
            return result
return wrapper
return decorator
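
# For illustration, a hedged sketch of how ``_check_output`` composes with
# the validators above; the decorated function is hypothetical:
#
#     @_check_output(ValidBboxesDataset)
#     def _df_to_dataset(df):
#         ...  # build and return an xr.Dataset from the dataframe
#
# The returned dataset is passed through ``ValidBboxesDataset`` before being
# handed back to the caller.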
def _check_input(validator: type, input_index: int = 0):
"""Return a decorator that validates a specific input of a function.
By default, the first input is validated. If the input index is
larger than the number of inputs, no validation is performed.
"""
def decorator(function: Callable) -> Callable:
@wraps(function)
        def wrapper(*args, **kwargs):
            # Validate only if the positional argument exists;
            # keyword arguments are not inspected
            if len(args) > input_index:
                validator(args[input_index])
            return function(*args, **kwargs)
return wrapper
return decorator
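
# For illustration, a hedged sketch of validating the first input of a
# hypothetical loader with ``_check_input``; the function name is a
# placeholder:
#
#     @_check_input(ValidCOCO)
#     def _load_coco(path):
#         with open(path) as file:
#             return json.load(file)
#
# Here ``ValidCOCO(path)`` runs before the function body, so an invalid file
# raises before any parsing is attempted.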