Module openpack_toolkit.data.dataloader.annotation

Expand source code
from logging import getLogger
from pathlib import Path

import numpy as np
import pandas as pd

from openpack_toolkit.activity import ActSet
from openpack_toolkit.data.const import (
    CLASS_ID_KEY_NAME,
    END_ISO_TIMESTMAP_KEY_NAME,
    END_UNIX_TIME_KEY_NAME,
    NULL_OPERATION_CLASS_ID,
    START_ISO_TIMESTMAP_KEY_NAME,
    START_UNIX_TIME_KEY_NAME,
    TIMESTAMP_KEY_NAME,
)
from openpack_toolkit.utils.time import convert_iso_timestamp_to_unixttime

OPERATION_LABEL_KEY_NAME = "operation"
ACTION_LABEL_KEY_NAME = "action"

logger = getLogger(__name__)


def load_and_resample_annotation(
    path: Path,
    unixtimes_ms: np.ndarray,
    classes: ActSet,
    label_col: str = "id",
) -> pd.DataFrame:
    """Load annotation data and resample them according to the given unixtime sequence.
    If there is no annotation record for a given timestamp, that record is treated
    as the NULL class.

    Args:
        path (Path): path to annotation CSV file.
        unixtimes_ms (np.ndarray): unixtime sequence (millisecond precision).
        classes (ActSet): activity class set; used to resolve the NULL class ID
            and to map class IDs to class indices.
        label_col (str): name of the column in the CSV that holds class IDs.
    Returns:
        pd.DataFrame: one row per entry of ``unixtimes_ms`` with columns
            ``unixtime``, ``annot_time``, ``user``, ``session``, ``box``,
            ``act_id``, ``act_idx``.
    """
    # NULL (ignore) class ID; get_ignore_class_id() may return a tuple,
    # in which case the last element is used.
    null_class_id = classes.get_ignore_class_id()
    if isinstance(null_class_id, tuple):
        null_class_id = null_class_id[-1]

    df = pd.read_csv(path)
    logger.debug(f"load annotation data from {path} -> df={df.shape}")
    ut_min, ut_max = df["unixtime"].min(), df["unixtime"].max()

    # Append a dummy record indexed at unixtime=0. Timestamps outside the
    # annotated range are remapped to 0 below, so they pick up this
    # NULL-class record during the .loc lookup.
    null_record = df.head(1).copy()
    null_record["unixtime"] = 0
    null_record["box"] = 0
    null_record[label_col] = null_class_id
    df = pd.concat([df, null_record], axis=0, ignore_index=True)

    # unixtime with second precision (millisecond part truncated).
    unixtimes_sec = unixtimes_ms - (unixtimes_ms % 1000)
    # Assign 0 to non-annotated timestamps (outside [ut_min, ut_max]).
    unixtimes_sec[unixtimes_sec < ut_min] = 0
    unixtimes_sec[unixtimes_sec > ut_max] = 0

    # Resample by label lookup: index annotations by their second-precision
    # unixtime, then select one row per requested timestamp.
    # NOTE(review): assumes the CSV's "unixtime" values are unique; duplicate
    # index values would fan out extra rows here — confirm upstream guarantee.
    df = df.rename(columns={"unixtime": "annot_time"}).set_index("annot_time")
    df = df.loc[unixtimes_sec, :].reset_index(drop=False)
    df["unixtime"] = unixtimes_ms

    df["act_id"] = df[label_col]
    df["act_idx"] = classes.convert_id_to_index(df["act_id"].values)

    cols = ["unixtime", "annot_time", "user", "session", "box", "act_id", "act_idx"]
    return df[cols]


def load_and_resample_operation_labels(
    path: Path,
    unixtimes_ms: np.ndarray,
    classes: ActSet,
) -> pd.DataFrame:
    """Load work-operation labels and resample them onto ``unixtimes_ms``.

    Thin wrapper around :func:`load_and_resample_annotation` that fixes the
    label column to the operation-label column (``"id"``).
    """
    return load_and_resample_annotation(
        path,
        unixtimes_ms,
        classes,
        label_col="id",
    )


def load_annotation_csv(path: Path) -> pd.DataFrame:
    """Load a ground-truth label CSV and add unixtime start/end columns.

    Timestamps in the annotation data are saved in ISO format (e.g.,
    ``2021-10-14 11:25:35.437000+09:00``) for human readability, so each
    start/end timestamp is converted to unixtime (millisecond precision)
    and stored in a new column.
    """
    df = pd.read_csv(path)

    # (source ISO column, destination unixtime column) pairs to convert.
    column_pairs = (
        (START_ISO_TIMESTMAP_KEY_NAME, START_UNIX_TIME_KEY_NAME),
        (END_ISO_TIMESTMAP_KEY_NAME, END_UNIX_TIME_KEY_NAME),
    )
    for iso_col, unixtime_col in column_pairs:
        df[unixtime_col] = df[iso_col].apply(convert_iso_timestamp_to_unixttime)

    logger.info(f"Load annotation data from {path}")
    return df


def add_label_cols_to_dataframe(
    df_data: pd.DataFrame,
    df_label: pd.DataFrame,
    src_label_col_name: str = CLASS_ID_KEY_NAME,
    new_label_col_name: str = OPERATION_LABEL_KEY_NAME,
    null_label_class_id: int = NULL_OPERATION_CLASS_ID,
) -> pd.DataFrame:
    """Add a label column to ``df_data`` (modified in place and also returned).
    Default params are set to add the work operation labels.

    Each row of ``df_label`` defines a half-open interval ``[start, end)``;
    records of ``df_data`` whose timestamp falls inside the interval receive
    that row's label ID. Records covered by no interval keep
    ``null_label_class_id``.

    Args:
        df_data: DataFrame with unixtime for each record.
        df_label: DataFrame of the work operation labels.
        src_label_col_name: column name of the label IDs in the df_label.
        new_label_col_name: new column name of the label IDs in the df_data.
        null_label_class_id: class ID for the null label.
    Returns:
        pd.DataFrame: ``df_data`` with the new label column inserted at
            position 1.
    Raises:
        AssertionError: if the required timestamp columns are missing.
    """
    assert TIMESTAMP_KEY_NAME in df_data.columns
    assert (START_UNIX_TIME_KEY_NAME in df_label.columns) and (
        END_UNIX_TIME_KEY_NAME in df_label.columns
    )

    # Initialize the whole column with the NULL class; annotated intervals
    # are overwritten below.
    df_data.insert(loc=1, column=new_label_col_name, value=null_label_class_id)

    # Hoist the loop-invariant timestamp column lookup out of the loop.
    timestamps = df_data[TIMESTAMP_KEY_NAME]
    for _, row in df_label.iterrows():
        timestamp_start = row[START_UNIX_TIME_KEY_NAME]
        timestamp_end = row[END_UNIX_TIME_KEY_NAME]
        operation_id = row[src_label_col_name]

        # Half-open interval (start inclusive, end exclusive) so adjacent
        # label segments never overlap. Assign via a boolean mask directly
        # instead of materializing an intermediate .index.
        mask = (timestamps >= timestamp_start) & (timestamps < timestamp_end)
        df_data.loc[mask, new_label_col_name] = operation_id
    return df_data

Functions

def add_label_cols_to_dataframe(df_data: pandas.core.frame.DataFrame, df_label: pandas.core.frame.DataFrame, src_label_col_name: str = 'id', new_label_col_name: str = 'operation', null_label_class_id: int = 8100) ‑> pandas.core.frame.DataFrame

Add label columns to the df_data. Default params are set to add the work operation labels.

Args

df_data
DataFrame with unixtime for each record.
df_label
DataFrame of the work operation labels.
src_label_col_name
column name of the label IDs in the df_label.
new_label_col_name
new column name of the label IDs in the df_data.
null_label_class_id
class ID for the null label.
Expand source code
def add_label_cols_to_dataframe(
    df_data: pd.DataFrame,
    df_label: pd.DataFrame,
    src_label_col_name: str = CLASS_ID_KEY_NAME,
    new_label_col_name: str = OPERATION_LABEL_KEY_NAME,
    null_label_class_id: int = NULL_OPERATION_CLASS_ID,
) -> pd.DataFrame:
    """Add a label column to ``df_data`` (modified in place and also returned).
    Default params are set to add the work operation labels.

    Each row of ``df_label`` defines a half-open interval ``[start, end)``;
    records of ``df_data`` whose timestamp falls inside the interval receive
    that row's label ID. Records covered by no interval keep
    ``null_label_class_id``.

    Args:
        df_data: DataFrame with unixtime for each record.
        df_label: DataFrame of the work operation labels.
        src_label_col_name: column name of the label IDs in the df_label.
        new_label_col_name: new column name of the label IDs in the df_data.
        null_label_class_id: class ID for the null label.
    Returns:
        pd.DataFrame: ``df_data`` with the new label column inserted at
            position 1.
    Raises:
        AssertionError: if the required timestamp columns are missing.
    """
    assert TIMESTAMP_KEY_NAME in df_data.columns
    assert (START_UNIX_TIME_KEY_NAME in df_label.columns) and (
        END_UNIX_TIME_KEY_NAME in df_label.columns
    )

    # Initialize the whole column with the NULL class; annotated intervals
    # are overwritten below.
    df_data.insert(loc=1, column=new_label_col_name, value=null_label_class_id)

    # Hoist the loop-invariant timestamp column lookup out of the loop.
    timestamps = df_data[TIMESTAMP_KEY_NAME]
    for _, row in df_label.iterrows():
        timestamp_start = row[START_UNIX_TIME_KEY_NAME]
        timestamp_end = row[END_UNIX_TIME_KEY_NAME]
        operation_id = row[src_label_col_name]

        # Half-open interval (start inclusive, end exclusive) so adjacent
        # label segments never overlap. Assign via a boolean mask directly
        # instead of materializing an intermediate .index.
        mask = (timestamps >= timestamp_start) & (timestamps < timestamp_end)
        df_data.loc[mask, new_label_col_name] = operation_id
    return df_data
def load_and_resample_annotation(path: pathlib.Path, unixtimes_ms: numpy.ndarray, classes: ActSet, label_col: str = 'id') ‑> pandas.core.frame.DataFrame

Load annotation data and resample them according to unixtime sequence T. If there is no annotation record for the given timestamp, that record is treated as NULL class.

Args

path : Path
path to annotation CSV file.
unixtimes_ms : np.ndarray
unixtime sequence (millisecond precision).

Returns

pd.DataFrame
-
Expand source code
def load_and_resample_annotation(
    path: Path,
    unixtimes_ms: np.ndarray,
    classes: ActSet,
    label_col: str = "id",
) -> pd.DataFrame:
    """Load annotation data and resample them according to the given unixtime sequence.
    If there is no annotation record for a given timestamp, that record is treated
    as the NULL class.

    Args:
        path (Path): path to annotation CSV file.
        unixtimes_ms (np.ndarray): unixtime sequence (millisecond precision).
        classes (ActSet): activity class set; used to resolve the NULL class ID
            and to map class IDs to class indices.
        label_col (str): name of the column in the CSV that holds class IDs.
    Returns:
        pd.DataFrame: one row per entry of ``unixtimes_ms`` with columns
            ``unixtime``, ``annot_time``, ``user``, ``session``, ``box``,
            ``act_id``, ``act_idx``.
    """
    # NULL (ignore) class ID; get_ignore_class_id() may return a tuple,
    # in which case the last element is used.
    null_class_id = classes.get_ignore_class_id()
    if isinstance(null_class_id, tuple):
        null_class_id = null_class_id[-1]

    df = pd.read_csv(path)
    logger.debug(f"load annotation data from {path} -> df={df.shape}")
    ut_min, ut_max = df["unixtime"].min(), df["unixtime"].max()

    # Append a dummy record indexed at unixtime=0. Timestamps outside the
    # annotated range are remapped to 0 below, so they pick up this
    # NULL-class record during the .loc lookup.
    null_record = df.head(1).copy()
    null_record["unixtime"] = 0
    null_record["box"] = 0
    null_record[label_col] = null_class_id
    df = pd.concat([df, null_record], axis=0, ignore_index=True)

    # unixtime with second precision (millisecond part truncated).
    unixtimes_sec = unixtimes_ms - (unixtimes_ms % 1000)
    # Assign 0 to non-annotated timestamps (outside [ut_min, ut_max]).
    unixtimes_sec[unixtimes_sec < ut_min] = 0
    unixtimes_sec[unixtimes_sec > ut_max] = 0

    # Resample by label lookup: index annotations by their second-precision
    # unixtime, then select one row per requested timestamp.
    # NOTE(review): assumes the CSV's "unixtime" values are unique; duplicate
    # index values would fan out extra rows here — confirm upstream guarantee.
    df = df.rename(columns={"unixtime": "annot_time"}).set_index("annot_time")
    df = df.loc[unixtimes_sec, :].reset_index(drop=False)
    df["unixtime"] = unixtimes_ms

    df["act_id"] = df[label_col]
    df["act_idx"] = classes.convert_id_to_index(df["act_id"].values)

    cols = ["unixtime", "annot_time", "user", "session", "box", "act_id", "act_idx"]
    return df[cols]
def load_and_resample_operation_labels(path: pathlib.Path, unixtimes_ms: numpy.ndarray, classes: ActSet) ‑> pandas.core.frame.DataFrame
Expand source code
def load_and_resample_operation_labels(
    path: Path,
    unixtimes_ms: np.ndarray,
    classes: ActSet,
) -> pd.DataFrame:
    """Load work-operation labels and resample them onto ``unixtimes_ms``.

    Thin wrapper around :func:`load_and_resample_annotation` that fixes the
    label column to the operation-label column (``"id"``).
    """
    return load_and_resample_annotation(
        path,
        unixtimes_ms,
        classes,
        label_col="id",
    )
def load_annotation_csv(path: pathlib.Path) ‑> pandas.core.frame.DataFrame

Load ground truth label CSV and convert start/end timestamp into unix time (millisecond precision).

Expand source code
def load_annotation_csv(path: Path) -> pd.DataFrame:
    """Load a ground-truth label CSV and add unixtime start/end columns.

    Timestamps in the annotation data are saved in ISO format (e.g.,
    ``2021-10-14 11:25:35.437000+09:00``) for human readability, so each
    start/end timestamp is converted to unixtime (millisecond precision)
    and stored in a new column.
    """
    df = pd.read_csv(path)

    # (source ISO column, destination unixtime column) pairs to convert.
    column_pairs = (
        (START_ISO_TIMESTMAP_KEY_NAME, START_UNIX_TIME_KEY_NAME),
        (END_ISO_TIMESTMAP_KEY_NAME, END_UNIX_TIME_KEY_NAME),
    )
    for iso_col, unixtime_col in column_pairs:
        df[unixtime_col] = df[iso_col].apply(convert_iso_timestamp_to_unixttime)

    logger.info(f"Load annotation data from {path}")
    return df