Source code for ethology.annotations.io.save_bboxes

"""Module for exporting manually labelled bounding boxes."""

import json
from datetime import datetime
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
import pytz

from ethology.annotations.io.load_bboxes import STANDARD_BBOXES_DF_INDEX
from ethology.annotations.validators import ValidCOCO

# Mapping of dataframe columns to COCO keys
STANDARD_BBOXES_DF_COLUMNS_TO_COCO = {
    "images": {
        "image_id": "id",
        "image_filename": "file_name",
        "image_width": "width",
        "image_height": "height",
    },
    "categories": {
        "category_id": "id",
        "category": "name",
        "supercategory": "supercategory",
    },
    "annotations": {
        "annotation_id": "id",
        "area": "area",
        "bbox": "bbox",
        "image_id": "image_id",
        "category_id": "category_id",
        "iscrowd": "iscrowd",
        "segmentation": "segmentation",
    },
}
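

# Illustrative sketch (an addition for this page, not part of the module's
# API): the helper name and sample values below are made up to show how the
# "images" sub-mapping above renames standard dataframe columns to their COCO
# counterparts via ``pandas.DataFrame.rename``.
def _demo_rename_images_section() -> pd.DataFrame:
    """Rename a toy one-row "images" dataframe to COCO keys."""
    df_images = pd.DataFrame(
        {
            "image_id": [0],
            "image_filename": ["frame_000.png"],
            "image_width": [1920],
            "image_height": [1080],
        }
    )
    # Resulting columns: "id", "file_name", "width", "height"
    return df_images.rename(
        columns=STANDARD_BBOXES_DF_COLUMNS_TO_COCO["images"]
    )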


def _validate_df_bboxes(df: pd.DataFrame):
    """Check if the input dataframe is a valid bounding boxes dataframe."""
    # Check type
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"Expected a pandas DataFrame, but got {type(df)}.")

    # Check index name is as expected
    if df.index.name != STANDARD_BBOXES_DF_INDEX:
        raise ValueError(
            f"Expected index name to be '{STANDARD_BBOXES_DF_INDEX}', "
            f"but got '{df.index.name}'."
        )

    # Check bboxes coordinates exist as df columns
    if not all(x in df.columns for x in ["x_min", "y_min", "width", "height"]):
        raise ValueError(
            "Required bounding box coordinates "
            "'x_min', 'y_min', 'width', 'height', are not present in "
            "the dataframe."
        )
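

# Illustrative sketch (an addition for this page, not part of the module's
# API): builds a toy dataframe with the index name and coordinate columns
# that ``_validate_df_bboxes`` checks for. All values are made up.
def _demo_valid_bboxes_df() -> pd.DataFrame:
    """Return a toy dataframe that passes ``_validate_df_bboxes``."""
    df = pd.DataFrame(
        {
            "image_filename": ["frame_000.png", "frame_001.png"],
            "x_min": [10.0, 20.0],
            "y_min": [5.0, 15.0],
            "width": [40.0, 30.0],
            "height": [20.0, 25.0],
        }
    )
    df.index.name = STANDARD_BBOXES_DF_INDEX
    _validate_df_bboxes(df)  # raises if the dataframe is not valid
    return df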


def _fill_in_COCO_required_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return the bboxes input dataframe with any COCO required data added."""
    # Add annotation_id as column
    df["annotation_id"] = df.index

    # Add COCO required data
    if "category" not in df.columns:
        df["category"] = ""  # if not defined: set as empty string

    if "category_id" not in df.columns or df["category_id"].dtype != int:
        df["category_id"] = df["category"].factorize(sort=True)[0]

    if "area" not in df.columns:
        df["area"] = df["width"] * df["height"]

    if "iscrowd" not in df.columns:
        df["iscrowd"] = 0  # if not defined: assume default value

    if "segmentation" not in df.columns:
        # If not defined: assume default value for iscrowd=0
        # Default is a polygon defined by the 4 corners of the bounding box

        # Compute 4 corners of the bounding box
        top_left_corner = df[["x_min", "y_min"]].to_numpy()
        delta_xy = df[["width", "height"]].to_numpy()
        delta_x_only = np.vstack([df["width"], np.zeros_like(df["height"])]).T
        delta_y_only = np.vstack([np.zeros_like(df["width"]), df["height"]]).T

        # Combine all xy coordinates of corners into one column
        df["segmentation"] = np.hstack(
            [
                top_left_corner,
                top_left_corner + delta_x_only,  # top right corner
                top_left_corner + delta_xy,  # bottom right corner
                top_left_corner + delta_y_only,  # bottom left corner
            ]
        ).tolist()

        # Wrap in a list of lists, to match the COCO polygon format
        df["segmentation"] = df["segmentation"].apply(lambda x: [x])

    if "bbox" not in df.columns:
        df["bbox"] = (
            df[["x_min", "y_min", "width", "height"]].to_numpy().tolist()
        )

    return df
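

# Illustrative sketch (an addition for this page, not part of the module's
# API): shows the defaults that ``_fill_in_COCO_required_data`` derives for a
# single made-up box of width 40 and height 20 at (x_min, y_min) = (10, 5).
def _demo_fill_in_defaults() -> pd.DataFrame:
    """Fill in COCO defaults for a toy one-row dataframe."""
    df = pd.DataFrame(
        {"x_min": [10.0], "y_min": [5.0], "width": [40.0], "height": [20.0]}
    )
    df = _fill_in_COCO_required_data(df)
    # area = 40 * 20 = 800.0
    # bbox = [10.0, 5.0, 40.0, 20.0]
    # segmentation = [[10.0, 5.0, 50.0, 5.0, 50.0, 25.0, 10.0, 25.0]]
    # iscrowd = 0, category = "", category_id = 0
    return df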


def _create_COCO_dict(df: pd.DataFrame) -> dict:
    """Extract COCO dictionary from a bounding boxes dataframe."""
    COCO_dict: dict[str, Any] = {}
    for section in ["images", "categories", "annotations"]:
        # Extract required columns
        df_section = df[
            list(STANDARD_BBOXES_DF_COLUMNS_TO_COCO[section].keys())
        ].copy()

        # Rename columns to COCO standard
        df_section = df_section.rename(
            columns=STANDARD_BBOXES_DF_COLUMNS_TO_COCO[section]
        )

        # Extract rows as lists of dictionaries
        if section == "annotations":
            row_dicts = df_section.to_dict(orient="records")
        else:
            row_dicts = df_section.drop_duplicates().to_dict(orient="records")

        # Add section to COCO_dict
        COCO_dict[section] = row_dicts

    # Add info section to COCO_dict
    COCO_dict["info"] = {
        "date_created": datetime.now(pytz.utc).strftime(
            "%a %b %d %Y %H:%M:%S GMT%z"
        ),
        "description": "Bounding boxes annotations exported from ethology",
        "url": "https://github.com/neuroinformatics-unit/ethology",
    }

    return COCO_dict
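

# Illustrative sketch (an addition for this page, not part of the module's
# API): assembles the COCO dictionary for a single made-up annotation, to
# show the sections that ``_create_COCO_dict`` produces
# ("images", "categories", "annotations", "info").
def _demo_create_COCO_dict() -> dict:
    """Build a COCO dictionary from a toy one-row dataframe."""
    df = pd.DataFrame(
        {
            "image_filename": ["frame_000.png"],
            "image_id": [0],
            "image_width": [1920],
            "image_height": [1080],
            "category": ["mouse"],
            "supercategory": ["animal"],
            "x_min": [10.0],
            "y_min": [5.0],
            "width": [40.0],
            "height": [20.0],
        }
    )
    df = _fill_in_COCO_required_data(df)
    COCO_dict = _create_COCO_dict(df)
    # sorted(COCO_dict) == ["annotations", "categories", "images", "info"]
    return COCO_dict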


def to_COCO_file(df: pd.DataFrame, output_filepath: str | Path):
    """Write bounding boxes annotations to a COCO JSON file.

    Parameters
    ----------
    df : pd.DataFrame
        Bounding boxes annotations dataframe.
    output_filepath : str or Path
        Output file path.

    Returns
    -------
    output_filepath : str or Path
        Output file path.

    """
    # Validate input dataframe
    _validate_df_bboxes(df)

    # Sort, drop duplicate annotations and reindex
    df = df.sort_values(by=["image_filename"])
    df = df.drop_duplicates(ignore_index=True)

    # Fill in COCO required data
    df = _fill_in_COCO_required_data(df)

    # Create COCO dictionary
    COCO_dict = _create_COCO_dict(df)

    # Write to JSON file
    with open(output_filepath, "w") as f:
        json.dump(COCO_dict, f, sort_keys=True, indent=2)

    # Check if the output file is a valid COCO file for ethology
    ValidCOCO(output_filepath)

    return output_filepath
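

# Illustrative usage sketch (an addition for this page, not part of the
# module's API): exports a toy two-box dataframe with ``to_COCO_file``. The
# filenames, image sizes, categories and coordinates are made up, and the
# default output path is an assumption for the example.
def _demo_to_COCO_file(
    output_filepath: str | Path = "demo_annotations.json",
):
    """Export a toy bounding boxes dataframe to a COCO JSON file."""
    df = pd.DataFrame(
        {
            "image_filename": ["frame_000.png", "frame_001.png"],
            "image_id": [0, 1],
            "image_width": [1920, 1920],
            "image_height": [1080, 1080],
            "category": ["mouse", "mouse"],
            "supercategory": ["animal", "animal"],
            "x_min": [10.0, 22.0],
            "y_min": [5.0, 17.0],
            "width": [40.0, 38.0],
            "height": [20.0, 21.0],
        }
    )
    df.index.name = STANDARD_BBOXES_DF_INDEX
    # Writes the COCO JSON file and validates it before returning the path
    return to_COCO_file(df, output_filepath)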