# ivus-complication-annotation-tool / tests / test_utils.py — keishi85, 16 Dec, "Testを実装" (test implementation); scraped page chrome removed
# -*- coding: utf-8 -*-
"""
Test suite for annotation saving utilities.
Tests data persistence, CSV encoding, and append behavior.
"""

import os
import csv
import tempfile
import shutil
import pytest
import pandas as pd
from pathlib import Path

# Import functions from utils module
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from utils.annotation_saver import (
    initialize_csv,
    save_annotation,
    get_annotated_cases
)


@pytest.fixture
def setup_teardown():
    """
    Create a temporary directory for each test and clean up afterward.

    The removal is wrapped in try/finally so the directory is deleted even
    if an exception is thrown into the generator (e.g. the fixture is torn
    down early), preventing temp-dir leaks on the test host.

    Yields:
        str: Path to temporary directory
    """
    # Create temporary directory unique to this test invocation
    temp_dir = tempfile.mkdtemp()

    try:
        yield temp_dir
    finally:
        # Cleanup: always remove the temporary directory after the test
        shutil.rmtree(temp_dir)


def test_new_file_creation(setup_teardown):
    """
    Test 1: CSV file creation with correct headers.

    Checks that a missing CSV is created on the first save, that the
    header row matches the expected schema, and that every field of the
    first record round-trips intact.
    """
    out_csv = os.path.join(setup_teardown, "annotations_test.csv")

    # The target file must not exist before the first save.
    assert not os.path.exists(out_csv), "CSV file should not exist before saving"

    # Persist the very first annotation.
    save_annotation(
        csv_path=out_csv,
        case_id=134,
        prediction="あり",
        confidence=75,
        reasons=["石灰化プラークが多い", "減衰プラークが多い"],
        comment="明確な所見あり",
        annotator="田中",
        ground_truth=True,
    )

    # The save call is responsible for creating the file.
    assert os.path.exists(out_csv), "CSV file should exist after saving"

    # Read back with the same encoding used for writing.
    frame = pd.read_csv(out_csv, encoding='utf-8-sig')

    # Header schema check.
    assert list(frame.columns) == [
        'timestamp', 'case_id', 'prediction', 'confidence',
        'reasons', 'comment', 'annotator', 'ground_truth'
    ], "CSV headers don't match expected"

    # Exactly one data row, with every field preserved.
    assert len(frame) == 1, "Should have exactly 1 data row"
    row = frame.loc[0]
    assert row['case_id'] == 134, "Case ID mismatch"
    assert row['prediction'] == "あり", "Prediction mismatch"
    assert row['confidence'] == 75, "Confidence mismatch"
    assert row['reasons'] == "石灰化プラークが多い; 減衰プラークが多い", "Reasons format incorrect"
    assert row['comment'] == "明確な所見あり", "Comment mismatch"
    assert row['annotator'] == "田中", "Annotator mismatch"
    assert row['ground_truth'] == True, "Ground truth mismatch"
    assert 'timestamp' in frame.columns and pd.notna(row['timestamp']), "Timestamp missing"


def test_append_mode(setup_teardown):
    """
    Test 2: Append functionality - ensure data is not overwritten.

    Saves two annotations into the same file and checks that both rows
    survive, in insertion order, with no data loss.
    """
    target = os.path.join(setup_teardown, "annotations_append_test.csv")

    # Two records written back-to-back; the second must append, not clobber.
    records = [
        dict(case_id=134, prediction="あり", confidence=75,
             reasons=["石灰化プラークが多い"], comment="First annotation",
             annotator="田中", ground_truth=True),
        dict(case_id=135, prediction="なし", confidence=50,
             reasons=["石灰化プラークが少ない"], comment="Second annotation",
             annotator="田中", ground_truth=False),
    ]
    for rec in records:
        save_annotation(csv_path=target, **rec)

    frame = pd.read_csv(target, encoding='utf-8-sig')

    assert len(frame) == 2, "Should have exactly 2 data rows after append"

    # Row 0: the original record must survive the second save.
    assert frame.loc[0, 'case_id'] == 134, "First row case ID should remain"
    assert frame.loc[0, 'prediction'] == "あり", "First row prediction should remain"
    assert frame.loc[0, 'comment'] == "First annotation", "First row comment should remain"

    # Row 1: the appended record.
    assert frame.loc[1, 'case_id'] == 135, "Second row case ID incorrect"
    assert frame.loc[1, 'prediction'] == "なし", "Second row prediction incorrect"
    assert frame.loc[1, 'comment'] == "Second annotation", "Second row comment incorrect"
    assert frame.loc[1, 'ground_truth'] == False, "Second row ground truth incorrect"


def test_japanese_encoding(setup_teardown):
    """
    Test 3: UTF-8-sig encoding with Japanese characters.

    Saves Japanese annotator/reason/comment text plus a None ground
    truth, then reads the file back with both pandas and the stdlib csv
    module to confirm nothing was corrupted (utf-8-sig keeps the BOM
    out of the first column name for Excel compatibility).
    """
    out_path = os.path.join(setup_teardown, "annotations_japanese_test.csv")

    # Japanese fixtures exercised through the full round trip.
    name = "田中"
    reason_list = ["石灰化プラークが多い", "減衰プラークが少ない"]
    note = "非常に明確な石灰化所見があり、合併症リスクが高いと判断しました。"

    save_annotation(
        csv_path=out_path,
        case_id=200,
        prediction="あり",
        confidence=100,
        reasons=reason_list,
        comment=note,
        annotator=name,
        ground_truth=None,  # None must serialize as an empty cell
    )

    # Read back using the same utf-8-sig encoding used on write.
    frame = pd.read_csv(out_path, encoding='utf-8-sig')

    # Japanese text must survive unchanged.
    assert frame.loc[0, 'annotator'] == name, f"Japanese name corrupted: expected {name}, got {frame.loc[0, 'annotator']}"
    assert frame.loc[0, 'prediction'] == "あり", "Japanese prediction corrupted"

    # Reasons are stored as one semicolon-joined field.
    joined = "; ".join(reason_list)
    assert frame.loc[0, 'reasons'] == joined, f"Japanese reasons corrupted: expected {joined}, got {frame.loc[0, 'reasons']}"

    assert frame.loc[0, 'comment'] == note, "Japanese comment corrupted"

    # A None ground truth parses back as an empty cell or NaN.
    gt = frame.loc[0, 'ground_truth']
    assert pd.isna(gt) or gt == "", f"Ground truth should be empty or NaN for None, got {gt}"

    # The stdlib csv reader must also decode the file cleanly.
    with open(out_path, 'r', encoding='utf-8-sig') as handle:
        parsed = list(csv.DictReader(handle))
    assert len(parsed) == 1, "Should have 1 row"
    assert parsed[0]['annotator'] == name, "CSV reader can't read Japanese correctly"


def test_decimal_case_ids(setup_teardown):
    """
    Test 4: Handling decimal case IDs (e.g., 134.1, 134.2).

    Saves one integer and two decimal case IDs, then checks all three
    appear in the stored file.
    """
    out_csv = os.path.join(setup_teardown, "annotations_decimal_test.csv")

    # (case_id, prediction, confidence, reasons, comment, ground_truth)
    fixtures = [
        (134, "あり", 75, ["石灰化プラークが多い"], "Integer case ID", True),
        (134.1, "なし", 50, ["石灰化プラークが少ない"], "Decimal case ID 1", False),
        (134.2, "あり", 80, ["減衰プラークが多い"], "Decimal case ID 2", True),
    ]
    for cid, pred, conf, why, note, gt in fixtures:
        save_annotation(
            csv_path=out_csv,
            case_id=cid,
            prediction=pred,
            confidence=conf,
            reasons=why,
            comment=note,
            annotator="田中",
            ground_truth=gt,
        )

    # All three rows must be present.
    frame = pd.read_csv(out_csv, encoding='utf-8-sig')
    assert len(frame) == 3, "Should have 3 rows"

    stored = frame['case_id'].tolist()
    # pandas may parse 134 as int or float depending on the column mix.
    assert 134 in stored or 134.0 in stored, "Integer case ID 134 not found"
    assert 134.1 in stored, "Decimal case ID 134.1 not found"
    assert 134.2 in stored, "Decimal case ID 134.2 not found"


def test_get_annotated_cases(setup_teardown):
    """
    Test 5: get_annotated_cases function.

    Verifies the returned set of annotated case IDs, the annotator
    filter, and the empty-set result for a missing file.
    """
    target = os.path.join(setup_teardown, "annotations_test.csv")

    # (case_id, prediction, confidence, reasons, comment, ground_truth)
    fixtures = [
        (134, "あり", 75, ["石灰化プラークが多い"], "Case 134", True),
        (135, "なし", 50, ["石灰化プラークが少ない"], "Case 135", False),
        (136, "あり", 80, ["減衰プラークが多い"], "Case 136", True),
    ]
    for cid, pred, conf, why, note, gt in fixtures:
        save_annotation(
            csv_path=target,
            case_id=cid,
            prediction=pred,
            confidence=conf,
            reasons=why,
            comment=note,
            annotator="田中",
            ground_truth=gt,
        )

    # All annotated cases, no filter.
    found = get_annotated_cases(target)
    assert len(found) == 3, f"Should have 3 unique cases, got {len(found)}"
    for cid in (134, 135, 136):
        assert cid in found, f"Case {cid} should be in annotated cases"

    # Filtered by annotator.
    only_tanaka = get_annotated_cases(target, annotator="田中")
    assert len(only_tanaka) == 3, f"田中 should have 3 cases, got {len(only_tanaka)}"

    # A missing file yields an empty set, not an error.
    missing = os.path.join(setup_teardown, "non_existent.csv")
    none_found = get_annotated_cases(missing)
    assert len(none_found) == 0, "Non-existent file should return empty set"
    assert isinstance(none_found, set), "Should return a set type"


def test_initialize_csv_idempotent(setup_teardown):
    """
    Test 6: initialize_csv is idempotent (safe to call multiple times).

    Initializes, writes one row, re-initializes, and confirms the
    existing data is untouched.
    """
    target = os.path.join(setup_teardown, "annotations_idempotent_test.csv")

    # First initialization must create the file.
    initialize_csv(target)
    assert os.path.exists(target), "File should be created"

    # Insert one record between the two initialize calls.
    save_annotation(
        csv_path=target,
        case_id=100,
        prediction="あり",
        confidence=75,
        reasons=["石灰化プラークが多い"],
        comment="Test data",
        annotator="田中",
        ground_truth=True,
    )

    # Re-initializing an existing file must be a no-op for its data.
    initialize_csv(target)

    frame = pd.read_csv(target, encoding='utf-8-sig')
    assert len(frame) == 1, "Data should be preserved after re-initialization"
    assert frame.loc[0, 'case_id'] == 100, "Case ID should be preserved"
    assert frame.loc[0, 'annotator'] == "田中", "Annotator should be preserved"


def test_empty_reasons_list(setup_teardown):
    """
    Test 7: Handling empty reasons list.

    An empty reasons list must be accepted and serialize to an empty
    cell (empty string or NaN after parsing).
    """
    out_csv = os.path.join(setup_teardown, "annotations_empty_reasons.csv")

    # Empty reasons list must not raise.
    save_annotation(
        csv_path=out_csv,
        case_id=200,
        prediction="なし",
        confidence=50,
        reasons=[],
        comment="No specific reasons",
        annotator="田中",
        ground_truth=False,
    )

    frame = pd.read_csv(out_csv, encoding='utf-8-sig')
    assert len(frame) == 1, "Should have 1 row"

    # An empty cell parses as "" or NaN depending on the reader.
    cell = frame.loc[0, 'reasons']
    assert cell == "" or pd.isna(cell), f"Empty reasons should be empty string, got: {cell}"