Module openpack_toolkit.data.dataloader.annotation
Expand source code
from logging import getLogger
from pathlib import Path
import numpy as np
import pandas as pd
from openpack_toolkit.activity import ActSet
from openpack_toolkit.data.const import (
    CLASS_ID_KEY_NAME,
    END_ISO_TIMESTMAP_KEY_NAME,
    END_UNIX_TIME_KEY_NAME,
    NULL_OPERATION_CLASS_ID,
    START_ISO_TIMESTMAP_KEY_NAME,
    START_UNIX_TIME_KEY_NAME,
    TIMESTAMP_KEY_NAME,
)
from openpack_toolkit.utils.time import convert_iso_timestamp_to_unixttime
# Default column name used when attaching work-operation labels to a dataframe.
OPERATION_LABEL_KEY_NAME = "operation"
# Default column name used when attaching action-level labels to a dataframe.
ACTION_LABEL_KEY_NAME = "action"
# Module-level logger, following the standard `getLogger(__name__)` convention.
logger = getLogger(__name__)
def load_and_resample_annotation(
    path: Path,
    unixtimes_ms: np.ndarray,
    classes: "ActSet",
    label_col: str = "id",
) -> pd.DataFrame:
    """Load annotation data and resample them against a unixtime sequence.

    Each entry of ``unixtimes_ms`` is truncated to second precision and looked
    up in the annotation table. Timestamps that fall outside the annotated
    range are assigned the NULL class of ``classes``.

    Args:
        path (Path): path to an annotation CSV file. The file must contain
            ``unixtime``, ``user``, ``session``, ``box`` and ``label_col``
            columns, with one row per annotated second.
        unixtimes_ms (np.ndarray): unixtime sequence (millisecond precision).
        classes (ActSet): activity set used to resolve the NULL class ID and
            to convert class IDs into class indices.
        label_col (str): name of the CSV column holding class IDs.
    Returns:
        pd.DataFrame: one row per entry of ``unixtimes_ms``, with columns
        ``unixtime``, ``annot_time``, ``user``, ``session``, ``box``,
        ``act_id`` and ``act_idx``.
    """
    null_class_id = classes.get_ignore_class_id()
    # get_ignore_class_id() may return a tuple of IDs; use the last entry as
    # the representative NULL class.
    if isinstance(null_class_id, tuple):
        null_class_id = null_class_id[-1]

    df = pd.read_csv(path)
    logger.debug(f"load annotation data from {path} -> df={df.shape}")
    ut_min, ut_max = df["unixtime"].min(), df["unixtime"].max()

    # Append a sentinel row at unixtime=0 carrying the NULL class, so that
    # out-of-range timestamps resolve to it via the index lookup below.
    null_record = df.head(1).copy()
    null_record["unixtime"] = 0
    null_record["box"] = 0
    null_record[label_col] = null_class_id
    df = pd.concat([df, null_record], axis=0, ignore_index=True)

    # Truncate to second precision (annotation rows are per second).
    # NOTE: this creates a new array; the caller's ``unixtimes_ms`` is not
    # modified by the in-place masking below.
    unixtimes_sec = unixtimes_ms - (unixtimes_ms % 1000)
    # Assign 0 (= the NULL sentinel row) to non-annotated timestamps.
    unixtimes_sec[unixtimes_sec < ut_min] = 0
    unixtimes_sec[unixtimes_sec > ut_max] = 0

    df = df.rename(columns={"unixtime": "annot_time"}).set_index("annot_time")
    # NOTE(review): this lookup assumes every second inside [ut_min, ut_max]
    # is present in the CSV; a missing second would raise a KeyError.
    df = df.loc[unixtimes_sec, :].reset_index(drop=False)
    df["unixtime"] = unixtimes_ms
    df["act_id"] = df[label_col]
    df["act_idx"] = classes.convert_id_to_index(df["act_id"].values)
    cols = ["unixtime", "annot_time", "user", "session", "box", "act_id", "act_idx"]
    return df[cols]
def load_and_resample_operation_labels(
    path: Path,
    unixtimes_ms: np.ndarray,
    classes: ActSet,
) -> pd.DataFrame:
    """Load work-operation labels and resample them to ``unixtimes_ms``.

    Thin convenience wrapper around :func:`load_and_resample_annotation`
    that fixes the label column to ``"id"``.
    """
    return load_and_resample_annotation(
        path=path,
        unixtimes_ms=unixtimes_ms,
        classes=classes,
        label_col="id",
    )
def load_annotation_csv(path: Path) -> pd.DataFrame:
    """Load ground truth label CSV and convert start/end timestamp into unix time (millisecond precision)."""
    annot_df = pd.read_csv(path)
    # Timestamps in the annotation file are stored as ISO strings (e.g.
    # "2021-10-14 11:25:35.437000+09:00") for human readability; derive the
    # corresponding unixtime columns (millisecond precision) up front.
    column_pairs = (
        (START_ISO_TIMESTMAP_KEY_NAME, START_UNIX_TIME_KEY_NAME),
        (END_ISO_TIMESTMAP_KEY_NAME, END_UNIX_TIME_KEY_NAME),
    )
    for iso_col, unix_col in column_pairs:
        annot_df[unix_col] = annot_df[iso_col].apply(convert_iso_timestamp_to_unixttime)
    logger.info(f"Load annotation data from {path}")
    return annot_df
def add_label_cols_to_dataframe(
    df_data: pd.DataFrame,
    df_label: pd.DataFrame,
    src_label_col_name: str = CLASS_ID_KEY_NAME,
    new_label_col_name: str = OPERATION_LABEL_KEY_NAME,
    null_label_class_id: int = NULL_OPERATION_CLASS_ID,
) -> pd.DataFrame:
    """Add a label column to ``df_data`` (modified in place).

    Default params are set to add the work operation labels.

    Args:
        df_data: DataFrame with a unixtime column for each record. A new
            label column is inserted at position 1, in place.
        df_label: DataFrame of the labels, one row per labeled interval,
            with start/end unixtime columns.
        src_label_col_name: column name of the label IDs in the df_label.
        new_label_col_name: new column name of the label IDs in the df_data.
        null_label_class_id: class ID assigned to records not covered by
            any label interval.
    Returns:
        pd.DataFrame: ``df_data`` with the new label column added.
    Raises:
        ValueError: if a required timestamp column is missing.
    """
    # Validate explicitly instead of with `assert`, which is stripped when
    # Python runs with optimizations enabled (`python -O`).
    if TIMESTAMP_KEY_NAME not in df_data.columns:
        raise ValueError(f"df_data must have a '{TIMESTAMP_KEY_NAME}' column")
    if (START_UNIX_TIME_KEY_NAME not in df_label.columns) or (
        END_UNIX_TIME_KEY_NAME not in df_label.columns
    ):
        raise ValueError(
            f"df_label must have '{START_UNIX_TIME_KEY_NAME}' and "
            f"'{END_UNIX_TIME_KEY_NAME}' columns"
        )

    # Start with the NULL class everywhere, then overwrite the rows covered
    # by each label interval [start, end).
    df_data.insert(loc=1, column=new_label_col_name, value=null_label_class_id)
    for _, row in df_label.iterrows():
        timestamp_start = row[START_UNIX_TIME_KEY_NAME]
        timestamp_end = row[END_UNIX_TIME_KEY_NAME]
        operation_id = row[src_label_col_name]
        mask = (df_data[TIMESTAMP_KEY_NAME] >= timestamp_start) & (
            df_data[TIMESTAMP_KEY_NAME] < timestamp_end
        )
        df_data.loc[mask, new_label_col_name] = operation_id
    return df_data
Functions
def add_label_cols_to_dataframe(df_data: pandas.core.frame.DataFrame, df_label: pandas.core.frame.DataFrame, src_label_col_name: str = 'id', new_label_col_name: str = 'operation', null_label_class_id: int = 8100) ‑> pandas.core.frame.DataFrame- 
Add label columns to the df_data. Default params are set to add the work operation labels.
Args
df_data- DataFrame with unixtime for each record.
 df_label- DataFrame of the work operation labels.
 src_label_col_name- column name of the label IDs in the df_label.
 new_label_col_name- new column name of the label IDs in the df_data.
 null_label_class_id- class ID for the null label.
 
Expand source code
def add_label_cols_to_dataframe( df_data: pd.DataFrame, df_label: pd.DataFrame, src_label_col_name: str = CLASS_ID_KEY_NAME, new_label_col_name: str = OPERATION_LABEL_KEY_NAME, null_label_class_id: int = NULL_OPERATION_CLASS_ID, ) -> pd.DataFrame: """Add label columns to the df_data. Default params are set to add the work operation labels. Args: df_data: DataFrame with unixtime for each record. df_labels: DataFrame of the work operation labels. src_label_col_name: column name of the label IDs in the df_label. new_label_col_name: new column name of the label IDs in the df_data. null_label_class_id: class ID for the null label. """ assert TIMESTAMP_KEY_NAME in df_data.columns assert (START_UNIX_TIME_KEY_NAME in df_label.columns) and ( END_UNIX_TIME_KEY_NAME in df_label.columns ) df_data.insert(loc=1, column=new_label_col_name, value=null_label_class_id) for _, row in df_label.iterrows(): timestamp_start = row[START_UNIX_TIME_KEY_NAME] timestamp_end = row[END_UNIX_TIME_KEY_NAME] operation_id = row[src_label_col_name] indices = df_data[ (df_data[TIMESTAMP_KEY_NAME] >= timestamp_start) & (df_data[TIMESTAMP_KEY_NAME] < timestamp_end) ].index df_data.loc[indices, new_label_col_name] = operation_id return df_data def load_and_resample_annotation(path: pathlib.Path, unixtimes_ms: numpy.ndarray, classes: ActSet, label_col: str = 'id') ‑> pandas.core.frame.DataFrame- 
Load annotation data and resample them according to unixtime sequence
T. If there are no annotation records for the given timestamp, that record is treated as the NULL class. Args
path:Path- path to annotation CSV file.
 unixtimes_ms:np.ndarray- unixtime sequence (millisecond precision).
 
Returns
pd.DataFrame- -
 
Expand source code
def load_and_resample_annotation( path: Path, unixtimes_ms: np.ndarray, classes: ActSet, label_col: str = "id", ) -> pd.DataFrame: """Load annotation data and resample them according to unixtime sequence ``T``. If there are no annotation records for the given timestamp, that records is treated as NULL class. Args: path (Path): path to annotation CSV file. unixitmes (np.ndarray): unixtime seqeuence (milli-scond precision). Returns: pd.DataFrame: - """ null_class_id = classes.get_ignore_class_id() if isinstance(null_class_id, tuple): null_class_id = null_class_id[-1] df = pd.read_csv(path) logger.debug(f"load annotation data from {path} -> df={df.shape}") ut_min, ut_max = df["unixtime"].min(), df["unixtime"].max() null_record = df.head(1).copy() null_record["unixtime"] = 0 null_record["box"] = 0 null_record[label_col] = null_class_id df = pd.concat([df, null_record], axis=0, ignore_index=True) # unixtime with second precision. unixtimes_sec = unixtimes_ms - (unixtimes_ms % 1000) # Assing 0 to non-annotated sequence. unixtimes_sec[unixtimes_sec < ut_min] = 0 unixtimes_sec[unixtimes_sec > ut_max] = 0 df = df.rename(columns={"unixtime": "annot_time"}).set_index("annot_time") df = df.loc[unixtimes_sec, :].reset_index(drop=False) df["unixtime"] = unixtimes_ms df["act_id"] = df[label_col] df["act_idx"] = classes.convert_id_to_index(df["act_id"].values) cols = ["unixtime", "annot_time", "user", "session", "box", "act_id", "act_idx"] return df[cols] def load_and_resample_operation_labels(path: pathlib.Path, unixtimes_ms: numpy.ndarray, classes: ActSet) ‑> pandas.core.frame.DataFrame- 
Expand source code
def load_and_resample_operation_labels( path: Path, unixtimes_ms: np.ndarray, classes: ActSet, ) -> pd.DataFrame: return load_and_resample_annotation(path, unixtimes_ms, classes, label_col="id") def load_annotation_csv(path: pathlib.Path) ‑> pandas.core.frame.DataFrame- 
Load ground truth label CSV and convert start/end timestamp into unix time (millisecond precision).
Expand source code
def load_annotation_csv(path: Path) -> pd.DataFrame: """Load ground truth label CSV and convert start/end timestamp into unix time (millisecond precision).""" df = pd.read_csv(path) # Timestamps in annotation data are saved in ISO format (e.g., 2021-10-14 11:25:35.437000+09:00) for human # readability. So we have to convert them into unixtimestamp (milli-second precision) in advance. df[START_UNIX_TIME_KEY_NAME] = df[START_ISO_TIMESTMAP_KEY_NAME].apply( convert_iso_timestamp_to_unixttime ) df[END_UNIX_TIME_KEY_NAME] = df[END_ISO_TIMESTMAP_KEY_NAME].apply( convert_iso_timestamp_to_unixttime ) logger.info(f"Load annotation data from {path}") return df