Demo-Maker/main.py at 40020aa73282babe1873cb99d7d419eb35622b7b

Fork: 0
mikado / Demo-Maker
Find file
Newer
Older
Demo-Maker / main.py
mikado-4410 on 27 Jan 2025 48 KB [update]SSDも計測できるように修正
Raw Blame History
import argparse
import csv
import os
import pickle
import re
import time
from threading import Thread

import cv2
import numpy as np
import pandas as pd
from mmdet.apis import DetInferencer, inference_detector, init_detector

# RTMpose
from mmpose.apis import inference_topdown
from mmpose.apis import init_model as init_pose_estimator
from mmpose.evaluation.functional import nms
from mmpose.registry import VISUALIZERS
from mmpose.structures import merge_data_samples
from mmpose.utils import adapt_mmdet_pipeline

# Pillow
from PIL import Image, ImageDraw

import config

# EARSNet
from modules.EARSNet.predictor import EARSNetPredictor

# Utilities
from util.calc_ste_position import CalcStethoscopePosition
from util.ears_ai import EarsAI

###############################################################################
# Config 値を参照
###############################################################################
# Marker colours per estimation method, taken from the project config module.
CONV_COLOR = config.CONV_COLOR
XGBOOST_COLOR = config.XGBOOST_COLOR
LIGHTGBM_COLOR = config.LIGHTGBM_COLOR
EARSNET_COLOR = config.EARSNET_COLOR
CATBOOST_COLOR = config.CATBOOST_COLOR
NGBOOST_COLOR = config.NGBOOST_COLOR

# Feature switches: each flag enables one model / pipeline stage in this script.
CONV_ENABLED = config.CONV_ENABLED
XGBOOST_ENABLED = config.XGBOOST_ENABLED
LIGHTGBM_ENABLED = config.LIGHTGBM_ENABLED
CATBOOST_ENABLED = config.CATBOOST_ENABLED
NGBOOST_ENABLED = config.NGBOOST_ENABLED
POSENET_ENABLED = config.POSENET_ENABLED
RTMPOSE_ENABLED = config.RTMPOSE_ENABLED
MOBILENETV1SSD_ENABLED = config.MOBILENETV1SSD_ENABLED  # set to True to use the SSD detector
YOLOX_ENABLED = config.YOLOX_ENABLED
EARSNET_ENABLED = config.EARSNET_ENABLED
EARSNET_CROP_ENABLED = config.EARSNET_CROP_ENABLED

NORMALIZE_ENABLED = config.NORMALIZE_ENABLED
DEVICE = config.DEVICE  # e.g. "cuda" or "cpu"

###############################################################################
# リアルタイムFPS計測用のグローバル変数＆スレッド定義 (描画時間は含まない)
###############################################################################
processed_frames = 0  # frames whose inference has finished (incremented by the main thread)
stop_fps_thread = False  # set to True to make the FPS monitor loop exit
fps_history = []  # (timestamp, fps) samples appended by fps_monitor


###############################################################################
# FPS監視スレッド
###############################################################################
def fps_monitor(interval=1.0):
    """Periodically derive the inference FPS from the processed-frame counter.

    Every ``interval`` seconds the function samples the module-level
    ``processed_frames`` counter, prints the rate since the previous sample
    and appends ``(timestamp, fps)`` to ``fps_history``. The loop ends once
    ``stop_fps_thread`` becomes true. Drawing time is not part of the figure
    because only the frame counter is observed.
    """
    global processed_frames, stop_fps_thread, fps_history

    prev_frames, prev_stamp = 0, time.time()

    while True:
        if stop_fps_thread:
            break

        time.sleep(interval)
        stamp = time.time()
        seen_frames = processed_frames

        frames_delta = seen_frames - prev_frames
        elapsed = stamp - prev_stamp
        # Guard against a zero-length interval so the division cannot fail.
        current_fps = frames_delta / elapsed if elapsed > 0 else 0.0

        print(
            f"[FPS Monitor] Real-time FPS: {current_fps:.2f}  (frames: +{frames_delta})"
        )
        fps_history.append((stamp, current_fps))

        prev_frames, prev_stamp = seen_frames, stamp


###############################################################################
# モデルロード系 (LightGBM/XGBoost など)
###############################################################################
def load_model(model_path, model_type="lgb"):
    """Unpickle and return the model stored at ``model_path``.

    ``model_type`` is accepted but not used by this function.
    NOTE(review): ``pickle.load`` can execute code embedded in the file, so
    only trusted model files should be passed here.
    """
    with open(model_path, "rb") as handle:
        model = pickle.load(handle)
    return model


def load_scaler(scaler_path):
    """Unpickle and return the scaler object stored at ``scaler_path``.

    NOTE(review): ``pickle.load`` can execute code embedded in the file, so
    only trusted scaler files should be passed here.
    """
    with open(scaler_path, "rb") as handle:
        scaler = pickle.load(handle)
    return scaler


###############################################################################
# YOLOX 初期化
###############################################################################
def init_yolox():
    """Build the YOLOX ``DetInferencer`` from the paths in ``config``.

    Returns:
        The inferencer, or ``None`` when anything in the set-up raises (the
        error message is printed rather than propagated).
    """
    try:
        from mmengine.registry import DefaultScope

        # Ensure an "mmdet" DefaultScope instance exists before the inferencer is built.
        DefaultScope.get_instance("mmdet", scope_name="mmdet")

        settings = dict(
            model=config.YOLOX_CONFIG_FILE,
            weights=config.YOLOX_CHECKPOINT_FILE,
            device=DEVICE,
        )
        return DetInferencer(**settings)
    except Exception as e:
        print(f"Error initializing YOLOX: {str(e)}")
        return None


###############################################################################
# SSD検出 (MobileNetV1 SSD)
###############################################################################
def ssd_detector_inference(frame, ears_ai):
    """Run the MobileNetV1 SSD detector through ``ears_ai.ssd_detect``.

    ``ears_ai.ssd_detect(frame, None)`` is expected to yield three values:
    the visualised overlay image and the x / y of the stethoscope centre
    (documented upstream as ``0, 0`` when nothing is detected).

    Returns:
        Tuple ``(overlay_img, x, y)``.
    """
    overlay, center_x, center_y = ears_ai.ssd_detect(frame, None)
    return overlay, center_x, center_y


###############################################################################
# Pillow-based drawing helpers
###############################################################################
def pillow_draw_circle(draw, center, radius, fill=None, outline=None, width=1):
    """Draw a circle via ``draw.ellipse`` using the box that encloses it.

    The centre coordinates are truncated to ints before the bounding box
    ``[(cx - r, cy - r), (cx + r, cy + r)]`` is built.
    """
    cx = int(center[0])
    cy = int(center[1])
    bounding_box = [(cx - radius, cy - radius), (cx + radius, cy + radius)]
    draw.ellipse(bounding_box, fill=fill, outline=outline, width=width)


def draw_glow_marker(draw, center, main_color, radius=5):
    """Draw a filled marker with a white ring around it.

    The ring has radius ``radius + 3`` and a 2 px white outline; the inner
    disc of radius ``radius`` is filled with ``main_color``.
    """
    marker_center = (int(center[0]), int(center[1]))

    # White ring first, so the filled disc is painted on top of it.
    pillow_draw_circle(
        draw,
        marker_center,
        radius + 3,
        fill=None,
        outline=(255, 255, 255),
        width=2,
    )
    # Solid centre.
    pillow_draw_circle(draw, marker_center, radius, fill=main_color)


def pillow_draw_polygon(draw, vertices, outline=(0, 255, 0), width=2):
    """Draw a closed polygon as individual line segments.

    Each vertex is truncated to integer coordinates. Nothing is drawn for
    fewer than two vertices; otherwise every vertex is joined to its
    successor and the last one back to the first.
    """
    corners = [(int(v[0]), int(v[1])) for v in vertices]
    if len(corners) <= 1:
        return

    # Pair each corner with the next one, wrapping around to close the shape.
    successors = corners[1:] + corners[:1]
    for start, end in zip(corners, successors):
        draw.line([start, end], fill=outline, width=width)


def pillow_draw_polyline(draw, points, color=(255, 0, 0), width=2):
    """Draw an open polyline through ``points`` as consecutive segments.

    Nothing is drawn for fewer than two points. Coordinates are truncated
    to ints before drawing.
    """
    if len(points) >= 2:
        path = [(int(p[0]), int(p[1])) for p in points]
        for seg_start, seg_end in zip(path, path[1:]):
            draw.line([seg_start, seg_end], fill=color, width=width)


def draw_polygon_and_detection_pillow(
    image, polygon_vertices, stethoscope_x, stethoscope_y
):
    """Overlay the body polygon and the stethoscope marker on a BGR image.

    The image is converted BGR -> RGB for Pillow, the polygon is outlined in
    green, a red glow marker is added when both stethoscope coordinates are
    not ``None``, and the result is converted back to a BGR ``np.ndarray``.
    """
    canvas = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    painter = ImageDraw.Draw(canvas)

    outline_points = [(int(v[0]), int(v[1])) for v in polygon_vertices]
    pillow_draw_polygon(painter, outline_points, outline=(0, 255, 0), width=2)

    has_detection = not (stethoscope_x is None or stethoscope_y is None)
    if has_detection:
        # Marker with a white halo for visibility.
        marker_at = (int(stethoscope_x), int(stethoscope_y))
        draw_glow_marker(painter, marker_at, main_color=(255, 0, 0), radius=8)

    return cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)


def yolox_detector_inference(frame, yolox_inferencer, pose_keypoints, score_thr=0.3):
    """Detect the stethoscope with YOLOX, restricted to the torso polygon.

    Args:
        frame: BGR image (it is converted to RGB before inference).
        yolox_inferencer: Callable invoked as
            ``yolox_inferencer(inputs=..., return_vis=True)``; its result must
            provide ``"predictions"`` and ``"visualization"`` lists.
        pose_keypoints: Indexable keypoints; indices 0, 5, 6, 11 and 12 are
            read as nose, left/right shoulder and left/right hip.
        score_thr: Minimum score for a detection to be considered.

    Returns:
        ``(overlay_img, stethoscope_x, stethoscope_y)``. The coordinates are
        the centre of the highest-scoring label-0 box whose centre lies inside
        the polygon, or ``(0, 0)`` when there is no such box.
    """
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = yolox_inferencer(inputs=frame_rgb, return_vis=True)
    predictions = result["predictions"][0]

    nose = pose_keypoints[0]
    left_shoulder = pose_keypoints[5]
    right_shoulder = pose_keypoints[6]
    left_hip = pose_keypoints[11]
    right_hip = pose_keypoints[12]

    # Widen the shoulder and hip lines so the search region is larger than
    # the bare keypoint quadrilateral.
    expanded_left_shoulder, expanded_right_shoulder = expand_points(
        left_shoulder, right_shoulder
    )
    expanded_left_hip, expanded_right_hip = expand_points(left_hip, right_hip)

    polygon_vertices = np.array(
        [
            nose,
            expanded_left_shoulder,
            expanded_left_hip,
            expanded_right_hip,
            expanded_right_shoulder,
        ]
    )

    best_center = None
    max_score = -1
    for label, score, bbox in zip(
        predictions["labels"], predictions["scores"], predictions["bboxes"]
    ):
        if not (score >= score_thr and label == 0):
            continue
        center_x = (bbox[0] + bbox[2]) / 2
        center_y = (bbox[1] + bbox[3]) / 2
        if score > max_score and point_in_polygon(
            [center_x, center_y], polygon_vertices
        ):
            best_center = (center_x, center_y)
            max_score = score

    stethoscope_overlay_img = result["visualization"][0]
    if (
        len(stethoscope_overlay_img.shape) == 3
        and stethoscope_overlay_img.shape[2] == 3
    ):
        stethoscope_overlay_img = cv2.cvtColor(
            stethoscope_overlay_img, cv2.COLOR_RGB2BGR
        )

    if best_center is None:
        # Callers receive the (0, 0) sentinel, but no marker is drawn: passing
        # None lets the drawing helper skip it instead of painting a bogus
        # marker at the image origin.
        marker_x = marker_y = None
        stethoscope_x, stethoscope_y = 0, 0
    else:
        stethoscope_x, stethoscope_y = best_center
        marker_x, marker_y = best_center

    stethoscope_overlay_img = draw_polygon_and_detection_pillow(
        stethoscope_overlay_img, polygon_vertices, marker_x, marker_y
    )
    return stethoscope_overlay_img, stethoscope_x, stethoscope_y


def expand_points(p1, p2):
    """Push two points apart to twice their distance, keeping the midpoint.

    Returns:
        Two ``np.ndarray`` points ``(new_p1, new_p2)``; each lies on the line
        through ``p1`` and ``p2`` at twice the original offset from the
        midpoint.
    """
    center_x = (p1[0] + p2[0]) / 2
    center_y = (p1[1] + p2[1]) / 2

    # Offset of p1 from the midpoint, doubled.
    reach_x = (p1[0] - center_x) * 2
    reach_y = (p1[1] - center_y) * 2

    stretched_p1 = np.array([center_x + reach_x, center_y + reach_y])
    stretched_p2 = np.array([center_x - reach_x, center_y - reach_y])
    return stretched_p1, stretched_p2


def point_in_polygon(point, vertices):
    """Return True when ``point`` is inside the polygon ``vertices``.

    Each edge whose endpoints lie on opposite sides of the horizontal line
    through the point is intersected with that line; the result flips every
    time the intersection is to the right of the point.
    """
    px, py = point
    crossings_odd = False
    for idx in range(len(vertices)):
        current = vertices[idx]
        previous = vertices[idx - 1]  # idx - 1 == -1 wraps to the last vertex
        if (current[1] > py) == (previous[1] > py):
            # Edge does not straddle the horizontal line through the point.
            continue
        slope = (previous[0] - current[0]) / (previous[1] - current[1])
        intersect_x = slope * (py - current[1]) + current[0]
        if px < intersect_x:
            crossings_odd = not crossings_odd
    return crossings_odd


###############################################################################
# 各種座標変換
###############################################################################
def normalize_quadrilateral_with_point(points, extra_point):
    """Centre, de-rotate and scale a quadrilateral together with one extra point.

    ``points`` is reshaped to rows of (x, y) and ``extra_point`` is appended
    as a further row. All rows are shifted by the mean of the ``points``
    rows, rotated by minus the average of the angle of row 0 -> row 1 and the
    angle of row 2 -> row 3, and divided by the longest edge among the first
    four rotated rows (no scaling when that length is 0).

    Returns:
        ``np.ndarray`` holding the transformed rows, extra point last.
    """
    quad = points.reshape(-1, 2)
    stacked = np.vstack([quad, extra_point])
    shifted = stacked - np.mean(quad, axis=0)

    top_angle = calculate_rotation_angle(shifted[0], shifted[1])
    bottom_angle = calculate_rotation_angle(shifted[2], shifted[3])
    theta = -((top_angle + bottom_angle) / 2)

    cos_t = np.cos(theta)
    sin_t = np.sin(theta)
    rotation = np.array([[cos_t, -sin_t], [sin_t, cos_t]])
    rotated = np.dot(shifted, rotation.T)

    # Edge i runs from corner i to corner i + 1 (wrapping) of the quad rows.
    quad_rotated = rotated[:4]
    edge_lengths = np.linalg.norm(
        np.roll(quad_rotated, -1, axis=0) - quad_rotated, axis=1
    )
    longest_edge = np.max(edge_lengths)

    if longest_edge == 0:
        return rotated
    return rotated / longest_edge


def calculate_rotation_angle(point1, point2):
    """Return the angle (radians, via ``np.arctan2``) of the vector point1 -> point2."""
    offset = point2 - point1
    rise, run = offset[1], offset[0]
    return np.arctan2(rise, run)


###############################################################################
# 動画→フレーム
###############################################################################
def video_to_frames(video_path, output_dir):
    """Split a video into PNG frames named ``<n>-frame.png`` (n starts at 1).

    Args:
        video_path: Path of the video to read.
        output_dir: Directory that receives the frames; created if missing.

    Raises:
        IOError: If the video cannot be opened.
    """
    os.makedirs(output_dir, exist_ok=True)
    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise IOError(f"Could not open video file: {video_path}")

        frame_num = 0
        while True:
            success, frame = video.read()
            if not success:
                break
            frame_num += 1
            cv2.imwrite(os.path.join(output_dir, f"{frame_num}-frame.png"), frame)
    finally:
        # Release the capture even when opening or writing fails.
        video.release()

    print(f"All frames saved to {output_dir}")


###############################################################################
# RTMpose キーポイント抽出
###############################################################################
def extract_keypoints_rtmpose(pose_results):
    """Pick the keypoints of the instance with the highest mean visibility.

    Every instance in every result's ``pred_instances`` is scored with the
    mean of its ``keypoints_visible``; the first instance with the strictly
    highest score above 0 wins.

    Returns:
        ``keypoints[0]`` of the winning instance, or ``None`` when
        ``pose_results`` is empty or no instance scores above 0.
    """
    if not pose_results:
        print("No pose results found.")
        return None

    candidates = (
        instance for result in pose_results for instance in result.pred_instances
    )

    top_score = 0
    winner = None
    for candidate in candidates:
        score = np.mean(candidate.keypoints_visible)
        if score > top_score:
            top_score, winner = score, candidate

    if winner is None:
        print("No valid instances found.")
        return None

    return winner.keypoints[0]


###############################################################################
# 胴体クロップ生成
###############################################################################
def crop_body_from_keypoints(frame, left_shoulder, right_shoulder, left_hip, right_hip):
    """Crop the torso region spanned by the shoulder and hip keypoints.

    The axis-aligned bounding box of the four points (coordinates truncated
    to ints) is grown by a 20 px margin and clamped to the frame.

    Returns:
        ``(cropped_frame, (xmin, ymin))`` - a copy of the cropped pixels and
        the top-left corner of the crop in frame coordinates.
    """
    frame_h, frame_w, _ = frame.shape
    joints = (left_shoulder, right_shoulder, left_hip, right_hip)
    xs = [joint[0] for joint in joints]
    ys = [joint[1] for joint in joints]

    pad = 20
    left = max(0, int(min(xs)) - pad)
    right = min(frame_w, int(max(xs)) + pad)
    top = max(0, int(min(ys)) - pad)
    bottom = min(frame_h, int(max(ys)) + pad)

    return frame[top:bottom, left:right].copy(), (left, top)


###############################################################################
# メイン処理 (推論 & 座標計算)
###############################################################################
def _timed_affine_conversion(calc_position, row):
    """Run ``calc_position.calc_affine`` for one frame and return the elapsed seconds."""
    started = time.time()
    body_points = np.array(
        [
            [row["left_shoulder_x"], row["left_shoulder_y"]],
            [row["right_shoulder_x"], row["right_shoulder_y"]],
            [row["left_hip_x"], row["left_hip_y"]],
            [row["right_hip_x"], row["right_hip_y"]],
        ],
        dtype=np.float32,
    )
    stethoscope_point = np.array([row["stethoscope_x"], row["stethoscope_y"]])
    _ = calc_position.calc_affine(body_points, *stethoscope_point)
    return time.time() - started


def _timed_regression(
    feature_row, input_columns, scaler_x, model_x, scaler_y, model_y
):
    """Scale the features and run the x / y regressors once; return the elapsed seconds."""
    started = time.time()
    input_data = pd.DataFrame([feature_row])
    scaled_for_x = scaler_x.transform(input_data[input_columns])
    _ = model_x.predict(scaled_for_x)[0]
    scaled_for_y = scaler_y.transform(input_data[input_columns])
    _ = model_y.predict(scaled_for_y)[0]
    return time.time() - started


def process_images(args, detector, pose_estimator, visualizer):
    """Run pose estimation, stethoscope detection and timing over all frames.

    Reads every ``*.png`` in ``<args.output_dir>/frames`` in numeric order,
    runs the stages enabled by the module-level ``*_ENABLED`` flags, and
    writes to ``<args.output_dir>/results``:

    * ``results.csv`` / ``results-convert.csv`` - raw and normalised
      coordinates per frame,
    * overlay and cropped images,
    * ``fps_results.csv`` - per-stage and per-pipeline timing statistics.

    ``processed_frames`` (module global) is incremented once per frame so the
    FPS monitor thread can observe progress.

    Args:
        args: Object with an ``output_dir`` attribute.
        detector: Person detector passed to ``inference_detector``.
        pose_estimator: Pose model passed to ``inference_topdown``.
        visualizer: Pose visualizer, or ``None`` to skip writing pose overlays.
    """
    global processed_frames
    ears_ai = EarsAI()
    calc_position = CalcStethoscopePosition()

    base_dir = os.path.join(args.output_dir, "frames")
    results_dir = os.path.join(args.output_dir, "results")
    csv_path = os.path.join(results_dir, "results.csv")
    normalized_csv_path = os.path.join(results_dir, "results-convert.csv")
    pose_overlay_dir = os.path.join(results_dir, "pose_overlay_image")
    stethoscope_overlay_dir = os.path.join(results_dir, "stethoscope_overlay_image")

    cropped_dir = os.path.join(results_dir, "cropped_images")
    os.makedirs(results_dir, exist_ok=True)
    os.makedirs(pose_overlay_dir, exist_ok=True)
    os.makedirs(stethoscope_overlay_dir, exist_ok=True)
    os.makedirs(cropped_dir, exist_ok=True)

    # Sort by the first number in the file name so "10-frame.png" follows "9-frame.png".
    png_files = sorted(
        [f for f in os.listdir(base_dir) if f.lower().endswith(".png")],
        key=lambda x: int(re.search(r"(\d+)", x).group(1)),
    )
    print(f"Found {len(png_files)} PNG files in {base_dir}.")

    rows = []
    normalized_rows = []

    # YOLOX initialisation
    yolox_inferencer = None
    if YOLOX_ENABLED:
        yolox_inferencer = init_yolox()

    # Per-stage ("*_single") and end-to-end ("pipeline_*") durations in seconds.
    timings = {
        "rtmpose_single": [],
        "yolox_single": [],
        "ssd_single": [],
        "conv_single": [],
        "lightgbm_single": [],
        "xgboost_single": [],
        "earsnet_single": [],
        "earsnet_cropped_single": [],
        "pipeline_rtmpose_yolox_conv": [],
        "pipeline_rtmpose_yolox_lightgbm": [],
        "pipeline_rtmpose_yolox_xgboost": [],
        "pipeline_rtmpose_ssd_conv": [],
        "pipeline_rtmpose_ssd_lightgbm": [],
        "pipeline_rtmpose_ssd_xgboost": [],
        "pipeline_earsnet": [],
        "pipeline_earsnet_cropped": [],
    }

    # Pre-load the regression models and scalers that are enabled.
    if LIGHTGBM_ENABLED:
        lgb_model_x = load_model("./models/LightGBM/stethoscope_calc_x_best_model.pkl")
        lgb_model_y = load_model("./models/LightGBM/stethoscope_calc_y_best_model.pkl")
        lgb_scaler_x = load_scaler("./models/LightGBM/scaler-x.pkl")
        lgb_scaler_y = load_scaler("./models/LightGBM/scaler-y.pkl")

    if XGBOOST_ENABLED:
        xg_model_x = load_model("./models/XGBoost/stethoscope_calc_x_best_model.pkl")
        xg_model_y = load_model("./models/XGBoost/stethoscope_calc_y_best_model.pkl")
        xg_scaler_x = load_scaler("./models/XGBoost/scaler-x.pkl")
        xg_scaler_y = load_scaler("./models/XGBoost/scaler-y.pkl")

    # NOTE(review): the CatBoost / NGBoost models are loaded but not used
    # anywhere in this function.
    if CATBOOST_ENABLED:
        catboost_model_x = load_model(
            "./models/CatBoost/stethoscope_calc_x_best_model.pkl"
        )
        catboost_model_y = load_model(
            "./models/CatBoost/stethoscope_calc_y_best_model.pkl"
        )

    if NGBOOST_ENABLED:
        ngboost_model_x = load_model(
            "./models/NGBoost/stethoscope_calc_x_best_model.pkl"
        )
        ngboost_model_y = load_model(
            "./models/NGBoost/stethoscope_calc_y_best_model.pkl"
        )

    if EARSNET_ENABLED:
        earsnet_predictor = EARSNetPredictor(
            weight_path="models/EARSNet/best_model.pth",
            resnet_depth="18",
            pretrained=True,
            device=DEVICE,
        )

    if EARSNET_CROP_ENABLED:
        earsnet_cropped_predictor = EARSNetPredictor(
            weight_path="models/EARSNet/crop/best_model.pth",
            resnet_depth="18",
            pretrained=True,
            device=DEVICE,
        )

    input_columns = [
        "left_shoulder_x",
        "left_shoulder_y",
        "right_shoulder_x",
        "right_shoulder_y",
        "left_hip_x",
        "left_hip_y",
        "right_hip_x",
        "right_hip_y",
        "stethoscope_x",
        "stethoscope_y",
    ]

    # Main loop
    for image_file_name in png_files:
        image_path = os.path.join(base_dir, image_file_name)
        frame = cv2.imread(image_path)
        if frame is None:
            print(f"Failed to load image: {image_path}")
            continue

        # ============= (A) Body keypoints via RTMPose or PoseNet =============
        rtmpose_time = 0.0
        pose_keypoints = None

        if RTMPOSE_ENABLED:
            start_time_rtmpose = time.time()
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            det_result = inference_detector(detector, frame_rgb)
            pred_instance = det_result.pred_instances.cpu().numpy()

            # Keep label-0 boxes with score > 0.3, then apply NMS.
            bboxes = np.concatenate(
                (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1
            )
            bboxes = bboxes[
                np.logical_and(pred_instance.labels == 0, pred_instance.scores > 0.3)
            ]
            bboxes = bboxes[nms(bboxes, 0.3), :4]

            pose_results = inference_topdown(pose_estimator, frame_rgb, bboxes)
            data_samples = merge_data_samples(pose_results)
            pose_keypoints = extract_keypoints_rtmpose(pose_results)

            end_time_rtmpose = time.time()
            rtmpose_time = end_time_rtmpose - start_time_rtmpose
            timings["rtmpose_single"].append(rtmpose_time)

            if pose_keypoints is None:
                print(f"Failed to extract keypoints for image: {image_path}")
                processed_frames += 1
                continue

            # Visualisation. Everything that needs the visualizer stays inside
            # the guard so a None visualizer just skips the overlay image.
            if visualizer is not None:
                visualizer.add_datasample(
                    "result",
                    frame_rgb,
                    data_sample=data_samples,
                    draw_gt=False,
                    draw_heatmap=False,
                    draw_bbox=False,
                    show_kpt_idx=False,
                    skeleton_style="mmpose",
                    show=False,
                    wait_time=0,
                    kpt_thr=0.3,
                )
                pose_overlay_img = visualizer.get_image()
                pose_overlay_bgr = cv2.cvtColor(pose_overlay_img, cv2.COLOR_RGB2BGR)
                cv2.imwrite(
                    os.path.join(pose_overlay_dir, image_file_name), pose_overlay_bgr
                )

            left_shoulder = (pose_keypoints[5][0], pose_keypoints[5][1])
            right_shoulder = (pose_keypoints[6][0], pose_keypoints[6][1])
            left_hip = (pose_keypoints[11][0], pose_keypoints[11][1])
            right_hip = (pose_keypoints[12][0], pose_keypoints[12][1])

        elif POSENET_ENABLED:
            start_time_rtmpose = time.time()
            pose_overlay_img, *landmarks = ears_ai.pose_detect(frame, None)
            end_time_rtmpose = time.time()
            rtmpose_time = end_time_rtmpose - start_time_rtmpose
            # PoseNet timings are recorded under the "rtmpose_single" key as well.
            timings["rtmpose_single"].append(rtmpose_time)

            # landmarks = [right_shoulder, left_shoulder, right_hip, left_hip] (example);
            # reorder here to match the environment in use.
            left_shoulder = (landmarks[0][1], landmarks[0][0])
            right_shoulder = (landmarks[1][1], landmarks[1][0])
            left_hip = (landmarks[2][1], landmarks[2][0])
            right_hip = (landmarks[3][1], landmarks[3][0])

            cv2.imwrite(
                os.path.join(pose_overlay_dir, image_file_name), pose_overlay_img
            )

        else:
            # Neither RTMPose nor PoseNet is enabled -> fixed zeros.
            left_shoulder = (0, 0)
            right_shoulder = (0, 0)
            left_hip = (0, 0)
            right_hip = (0, 0)

        # ============= (B) Stethoscope detection: YOLOX or SSD =============
        stethoscope_x, stethoscope_y = 0, 0
        detection_time = 0.0

        # 1) YOLOX
        if YOLOX_ENABLED:
            if pose_keypoints is not None:
                start_t = time.time()
                from_yolox_img, stethoscope_x, stethoscope_y = yolox_detector_inference(
                    frame, yolox_inferencer, pose_keypoints
                )
                end_t = time.time()
                detection_time = end_t - start_t
                timings["yolox_single"].append(detection_time)

                # Visualisation
                cv2.imwrite(
                    os.path.join(stethoscope_overlay_dir, image_file_name),
                    from_yolox_img,
                )

        # 2) SSD (MobileNetV1)
        elif MOBILENETV1SSD_ENABLED:
            start_t = time.time()
            from_ssd_img, stethoscope_x, stethoscope_y = ssd_detector_inference(
                frame, ears_ai
            )
            end_t = time.time()
            detection_time = end_t - start_t
            timings["ssd_single"].append(detection_time)

            # Visualisation
            cv2.imwrite(
                os.path.join(stethoscope_overlay_dir, image_file_name),
                from_ssd_img,
            )

        # Combined pose + detector time, shared by every pipeline entry below.
        detection_time_rtmpose_detector = rtmpose_time + detection_time

        # ============= (C) EARSNet (stand-alone) =============
        if EARSNET_ENABLED:
            start_time_earsnet = time.time()
            earsnet_x, earsnet_y = earsnet_predictor.predict(image_path)
            end_time_earsnet = time.time()
            earsnet_time = end_time_earsnet - start_time_earsnet
            timings["earsnet_single"].append(earsnet_time)
            timings["pipeline_earsnet"].append(earsnet_time)

        else:
            earsnet_x, earsnet_y = 0, 0

        # ============= (D) EARSNet (cropped input) =============
        if EARSNET_CROP_ENABLED:
            cropped_img, (crop_xmin, crop_ymin) = crop_body_from_keypoints(
                frame, left_shoulder, right_shoulder, left_hip, right_hip
            )
            cropped_filename = os.path.splitext(image_file_name)[0] + "_cropped.png"
            cv2.imwrite(os.path.join(cropped_dir, cropped_filename), cropped_img)

            start_time_earsnet_cropped = time.time()
            earsnet_cropped_x, earsnet_cropped_y = earsnet_cropped_predictor.predict(
                os.path.join(cropped_dir, cropped_filename)
            )
            end_time_earsnet_cropped = time.time()
            earsnet_cropped_time = end_time_earsnet_cropped - start_time_earsnet_cropped
            timings["earsnet_cropped_single"].append(earsnet_cropped_time)

            pipeline_earsnet_cropped_time = rtmpose_time + earsnet_cropped_time
            timings["pipeline_earsnet_cropped"].append(pipeline_earsnet_cropped_time)

        else:
            earsnet_cropped_x, earsnet_cropped_y = 0, 0

        # ============= (E) Assemble the result row =============
        row = {
            "image_file_name": image_file_name,
            "left_shoulder_x": left_shoulder[0],
            "left_shoulder_y": left_shoulder[1],
            "right_shoulder_x": right_shoulder[0],
            "right_shoulder_y": right_shoulder[1],
            "left_hip_x": left_hip[0],
            "left_hip_y": left_hip[1],
            "right_hip_x": right_hip[0],
            "right_hip_y": right_hip[1],
            "stethoscope_x": stethoscope_x,
            "stethoscope_y": stethoscope_y,
            "earsnet_stethoscope_x": earsnet_x,
            "earsnet_stethoscope_y": earsnet_y,
            "earsnet_crop_stethoscope_x": earsnet_cropped_x,
            "earsnet_crop_stethoscope_y": earsnet_cropped_y,
        }

        # ============= (F) Normalisation =============
        source_points = np.array(
            [
                [float(row["left_shoulder_x"]), float(row["left_shoulder_y"])],
                [float(row["right_shoulder_x"]), float(row["right_shoulder_y"])],
                [float(row["left_hip_x"]), float(row["left_hip_y"])],
                [float(row["right_hip_x"]), float(row["right_hip_y"])],
            ],
            dtype=np.float32,
        )
        stethoscope_point = np.array([row["stethoscope_x"], row["stethoscope_y"]])
        normalized_points = normalize_quadrilateral_with_point(
            source_points.flatten(), stethoscope_point
        )

        normalized_row = {
            "image_file_name": image_file_name,
            "left_shoulder_x": normalized_points[0, 0],
            "left_shoulder_y": normalized_points[0, 1],
            "right_shoulder_x": normalized_points[1, 0],
            "right_shoulder_y": normalized_points[1, 1],
            "left_hip_x": normalized_points[2, 0],
            "left_hip_y": normalized_points[2, 1],
            "right_hip_x": normalized_points[3, 0],
            "right_hip_y": normalized_points[3, 1],
            "stethoscope_x": normalized_points[4, 0],
            "stethoscope_y": normalized_points[4, 1],
        }

        # Normalise the EARSNet predictions (full image / cropped) the same way.
        if EARSNET_ENABLED:
            p_earsnet = np.array(
                [row["earsnet_stethoscope_x"], row["earsnet_stethoscope_y"]]
            )
            norm_earsnet = normalize_quadrilateral_with_point(
                source_points.flatten(), p_earsnet
            )
            normalized_row["earsnet_stethoscope_x"] = norm_earsnet[4, 0]
            normalized_row["earsnet_stethoscope_y"] = norm_earsnet[4, 1]

        if EARSNET_CROP_ENABLED:
            p_earsnet_crop = np.array(
                [row["earsnet_crop_stethoscope_x"], row["earsnet_crop_stethoscope_y"]]
            )
            norm_earsnet_crop = normalize_quadrilateral_with_point(
                source_points.flatten(), p_earsnet_crop
            )
            normalized_row["earsnet_crop_stethoscope_x"] = norm_earsnet_crop[4, 0]
            normalized_row["earsnet_crop_stethoscope_y"] = norm_earsnet_crop[4, 1]

        # ============= (G) Pipelines (Conv / XGBoost / LightGBM) =============
        # The YOLOX and SSD pipelines only differ in the timing key they feed,
        # so pick the tag once and share the measurement code.
        if RTMPOSE_ENABLED:
            if YOLOX_ENABLED and pose_keypoints is not None:
                detector_tag = "yolox"
            elif MOBILENETV1SSD_ENABLED:
                detector_tag = "ssd"
            else:
                detector_tag = None

            if detector_tag is not None:
                feature_row = normalized_row if NORMALIZE_ENABLED else row

                # conv
                if CONV_ENABLED:
                    conv_time = _timed_affine_conversion(calc_position, row)
                    timings["conv_single"].append(conv_time)
                    timings[f"pipeline_rtmpose_{detector_tag}_conv"].append(
                        detection_time_rtmpose_detector + conv_time
                    )

                # XGBoost
                if XGBOOST_ENABLED:
                    xg_time = _timed_regression(
                        feature_row,
                        input_columns,
                        xg_scaler_x,
                        xg_model_x,
                        xg_scaler_y,
                        xg_model_y,
                    )
                    timings["xgboost_single"].append(xg_time)
                    timings[f"pipeline_rtmpose_{detector_tag}_xgboost"].append(
                        detection_time_rtmpose_detector + xg_time
                    )

                # LightGBM
                if LIGHTGBM_ENABLED:
                    lgb_time = _timed_regression(
                        feature_row,
                        input_columns,
                        lgb_scaler_x,
                        lgb_model_x,
                        lgb_scaler_y,
                        lgb_model_y,
                    )
                    timings["lightgbm_single"].append(lgb_time)
                    timings[f"pipeline_rtmpose_{detector_tag}_lightgbm"].append(
                        detection_time_rtmpose_detector + lgb_time
                    )

        rows.append(row)
        # Normalised data
        normalized_rows.append(normalized_row)

        processed_frames += 1

    # ============= Write the CSV files =============
    if rows:
        fieldnames = list(rows[0].keys())

        with (
            open(csv_path, "w", newline="") as csvfile,
            open(normalized_csv_path, "w", newline="") as norm_csvfile,
        ):
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            norm_fieldnames = list(normalized_rows[0].keys())
            norm_writer = csv.DictWriter(norm_csvfile, fieldnames=norm_fieldnames)
            norm_writer.writeheader()

            for row_, norm_row_ in zip(rows, normalized_rows):
                writer.writerow(row_)
                norm_writer.writerow(norm_row_)

        print(f"Processed and saved results to: {csv_path}")
        print(f"Processed and saved normalized results to: {normalized_csv_path}")

        # Render the visualisations / videos
        generate_visualizations(csv_path, base_dir, results_dir)
    else:
        print("No data to write to CSV.")

    # ============= Save the FPS measurements as CSV =============
    fps_data = []
    for method_name, time_list in timings.items():
        if not time_list:
            continue
        total_time = sum(time_list)
        num_calls = len(time_list)
        avg_time = total_time / num_calls if num_calls > 0 else 0
        fps = 1.0 / avg_time if avg_time > 0 else 0
        fps_data.append(
            {
                "method_name": method_name,
                "num_calls": num_calls,
                "total_time_sec": f"{total_time:.6f}",
                "avg_time_sec": f"{avg_time:.6f}",
                "fps": f"{fps:.2f}",
            }
        )

    fps_csv_path = os.path.join(results_dir, "fps_results.csv")
    with open(fps_csv_path, "w", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "method_name",
                "num_calls",
                "total_time_sec",
                "avg_time_sec",
                "fps",
            ],
        )
        writer.writeheader()
        for rowf in fps_data:
            writer.writerow(rowf)

    print("\n===== FPS Results (subcomponent & pipeline) =====")
    for rowf in fps_data:
        print(
            f"{rowf['method_name']}: calls={rowf['num_calls']}, "
            f"total={rowf['total_time_sec']}s, avg={rowf['avg_time_sec']}s, FPS={rowf['fps']}"
        )


###############################################################################
# 可視化・動画化 (描画時間はFPSに含めない)
###############################################################################
def _viz_row_point(row, prefix):
    """Return ``(x, y)`` as ints from ``{prefix}_x`` / ``{prefix}_y`` of *row*.

    Returns ``None`` when either column is absent from the row or holds NaN.
    """
    col_x, col_y = f"{prefix}_x", f"{prefix}_y"
    if col_x not in row or col_y not in row:
        return None
    if pd.isna(row[col_x]) or pd.isna(row[col_y]):
        return None
    return int(row[col_x]), int(row[col_y])


def _viz_draw_glow_marker(draw, center, main_color, radius=5):
    """Draw a filled circle of *main_color* surrounded by a white ring."""
    x, y = int(center[0]), int(center[1])
    # White outline slightly larger than the marker, for contrast.
    pillow_draw_circle(
        draw, (x, y), radius + 3, fill=None, outline=(255, 255, 255), width=2
    )
    # Filled centre.
    pillow_draw_circle(draw, (x, y), radius, fill=main_color)


def _viz_save_rgb(pil_image, out_path):
    """Write a Pillow RGB image to *out_path* through OpenCV (which expects BGR)."""
    cv2.imwrite(out_path, cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR))


def generate_visualizations(csv_path, original_images_dir, results_dir):
    """
    Read the result CSV, render per-frame visualizations, and turn them into videos.

    For every CSV row whose frame image can be read, the following images are
    written under *results_dir* (using the row's ``image_file_name``):

    * ``marked_images``: the frame with shoulders, hips and stethoscope marked.
    * ``marked_pose_images``: the frame with only the pose points marked.
    * ``marked_stethoscope_images``: the frame with only the stethoscope marked.
    * ``<method>_with_trajectory`` / ``<method>_without_trajectory``: the body
      template (``./images/body/BodyF.png``) with that method's estimated
      position, with or without the polyline of all positions seen so far.
    * ``combined_with_trajectory`` / ``combined_without_trajectory``: the same,
      with all enabled methods drawn on one template.

    Videos are then created for the three frame-based folders and for each
    per-method folder. The combined folders are written as images only.

    Pose colour is (33, 95, 154) and stethoscope colour is (19, 80, 27); every
    marker gets a white ring for visibility. Per the original author's note,
    rendering is done here in one pass so that it is not part of the FPS timing.

    Args:
        csv_path: Path of the CSV produced by the processing step.
        original_images_dir: Directory holding the extracted frame images.
        results_dir: Directory that receives all rendered images and videos.

    Returns:
        None. Returns early (after printing a warning) when the body template
        image does not exist.
    """
    df = pd.read_csv(csv_path)
    body_image_path = "./images/body/BodyF.png"
    if not os.path.exists(body_image_path):
        print(f"Warning: {body_image_path} not found.")
        return

    # Load the template once (RGB) and close the underlying file right away;
    # every canvas below is a copy of this image.
    with Image.open(body_image_path) as body_src:
        body_template = body_src.convert("RGB")

    # Output folder stem per key; insertion order matters for drawing order.
    method_flags = [
        (CONV_ENABLED, "conv"),
        (XGBOOST_ENABLED, "Xgboost"),
        (LIGHTGBM_ENABLED, "lightGBM"),
        (CATBOOST_ENABLED, "catboost"),
        (NGBOOST_ENABLED, "ngboost"),
        (EARSNET_ENABLED, "earsnet"),
        (EARSNET_CROP_ENABLED, "earsnet_crop"),
    ]
    dirs = {"marked": "marked_images"}
    dirs.update({name: name for enabled, name in method_flags if enabled})
    dirs["combined"] = "combined"

    # Additional frame-based output folders.
    pose_only_dir = "marked_pose_images"
    stetho_only_dir = "marked_stethoscope_images"

    marked_dir = os.path.join(results_dir, "marked_images")
    pose_dir_path = os.path.join(results_dir, pose_only_dir)
    stetho_dir_path = os.path.join(results_dir, stetho_only_dir)
    for frame_dir in (marked_dir, pose_dir_path, stetho_dir_path):
        os.makedirs(frame_dir, exist_ok=True)

    # Creates the per-method folders and the two "combined_*" folders, so no
    # directory needs to be created inside the per-row loop.
    for key, stem in dirs.items():
        if key == "marked":
            continue
        os.makedirs(os.path.join(results_dir, f"{stem}_with_trajectory"), exist_ok=True)
        os.makedirs(
            os.path.join(results_dir, f"{stem}_without_trajectory"), exist_ok=True
        )
    combined_with_dir = os.path.join(results_dir, "combined_with_trajectory")
    combined_without_dir = os.path.join(results_dir, "combined_without_trajectory")

    # Accumulated positions per method, used to draw the trajectory polyline.
    points = {key: [] for key in dirs if key not in ["marked", "combined"]}

    colors = {
        "conv": CONV_COLOR,
        "Xgboost": XGBOOST_COLOR,
        "lightGBM": LIGHTGBM_COLOR,
        "catboost": CATBOOST_COLOR,
        "ngboost": NGBOOST_COLOR,
        "earsnet": EARSNET_COLOR,
        "earsnet_crop": (255, 51, 255),
    }

    pose_color_rgb = (33, 95, 154)
    stetho_color_rgb = (19, 80, 27)
    pose_points = ["left_shoulder", "right_shoulder", "left_hip", "right_hip"]

    for _, row in df.iterrows():
        file_name = row["image_file_name"]
        original_image_path = os.path.join(original_images_dir, file_name)
        if not os.path.exists(original_image_path):
            continue
        original_image = cv2.imread(original_image_path)
        if original_image is None:
            continue

        # Convert the frame to RGB once; each view draws on its own copy.
        frame_pil = Image.fromarray(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))

        # 1) marked_images: shoulders, hips and stethoscope.
        # NOTE(review): unlike the stethoscope-only view below, a stethoscope
        # at (0, 0) is still drawn here -- confirm whether that is intended.
        pil_marked = frame_pil.copy()
        draw_marked = ImageDraw.Draw(pil_marked)
        for point_name in pose_points + ["stethoscope"]:
            xy = _viz_row_point(row, point_name)
            if xy is not None:
                _viz_draw_glow_marker(
                    draw_marked, xy, main_color=(255, 255, 0), radius=5
                )
        _viz_save_rgb(pil_marked, os.path.join(marked_dir, file_name))

        # 2) marked_pose_images: pose points only.
        pil_pose = frame_pil.copy()
        draw_pose = ImageDraw.Draw(pil_pose)
        for point_name in pose_points:
            xy = _viz_row_point(row, point_name)
            if xy is not None:
                _viz_draw_glow_marker(
                    draw_pose, xy, main_color=pose_color_rgb, radius=15
                )
        _viz_save_rgb(pil_pose, os.path.join(pose_dir_path, file_name))

        # 3) marked_stethoscope_images: stethoscope only, skipping (0, 0).
        pil_stetho = frame_pil.copy()
        draw_stetho = ImageDraw.Draw(pil_stetho)
        stetho_xy = _viz_row_point(row, "stethoscope")
        if stetho_xy is not None and stetho_xy != (0, 0):
            _viz_draw_glow_marker(
                draw_stetho, stetho_xy, main_color=stetho_color_rgb, radius=15
            )
        _viz_save_rgb(pil_stetho, os.path.join(stetho_dir_path, file_name))

        # 4) Body-template views: one pair per method plus the combined pair.
        pil_with_traj = body_template.copy()
        pil_without_traj = body_template.copy()
        draw_with_traj = ImageDraw.Draw(pil_with_traj)
        draw_without_traj = ImageDraw.Draw(pil_without_traj)

        for key, history in points.items():
            xy = _viz_row_point(row, f"{key}_stethoscope")
            if xy is None:
                continue
            history.append(xy)
            color = colors.get(key, (0, 0, 255))
            has_trajectory = len(history) > 1

            # Per-method image with trajectory.
            pil_indiv_with = body_template.copy()
            draw_indiv_with = ImageDraw.Draw(pil_indiv_with)
            if has_trajectory:
                pillow_draw_polyline(draw_indiv_with, history, color=color, width=2)
            _viz_draw_glow_marker(draw_indiv_with, xy, main_color=color, radius=8)
            _viz_save_rgb(
                pil_indiv_with,
                os.path.join(results_dir, f"{dirs[key]}_with_trajectory", file_name),
            )

            # Per-method image without trajectory.
            pil_indiv_without = body_template.copy()
            draw_indiv_without = ImageDraw.Draw(pil_indiv_without)
            _viz_draw_glow_marker(draw_indiv_without, xy, main_color=color, radius=8)
            _viz_save_rgb(
                pil_indiv_without,
                os.path.join(
                    results_dir, f"{dirs[key]}_without_trajectory", file_name
                ),
            )

            # Combined image with trajectory.
            if has_trajectory:
                pillow_draw_polyline(draw_with_traj, history, color=color, width=2)
            _viz_draw_glow_marker(draw_with_traj, xy, main_color=color, radius=8)

            # Combined image without trajectory.
            _viz_draw_glow_marker(draw_without_traj, xy, main_color=color, radius=8)

        _viz_save_rgb(pil_with_traj, os.path.join(combined_with_dir, file_name))
        _viz_save_rgb(pil_without_traj, os.path.join(combined_without_dir, file_name))

    # Turn the rendered frames into videos.
    create_video_from_images(
        marked_dir,
        os.path.join(results_dir, "marked_video.mp4"),
    )
    create_video_from_images(
        pose_dir_path,
        os.path.join(results_dir, "pose_video.mp4"),
    )
    create_video_from_images(
        stetho_dir_path,
        os.path.join(results_dir, "stethoscope_video.mp4"),
    )

    # Only per-method folders get videos; "combined" is skipped here.
    for key in dirs:
        if key not in ["marked", "combined"]:
            create_video_from_images(
                os.path.join(results_dir, f"{dirs[key]}_with_trajectory"),
                os.path.join(results_dir, f"{key}_video_with_trajectory.mp4"),
            )
            create_video_from_images(
                os.path.join(results_dir, f"{dirs[key]}_without_trajectory"),
                os.path.join(results_dir, f"{key}_video_without_trajectory.mp4"),
            )


def create_video_from_images(image_dir, output_path):
    """Assemble the ``.png`` files in *image_dir* into an mp4 at 30 fps.

    Frames are ordered by the first run of digits in each file name. Files
    without any digits are placed after the numbered ones, ordered by name,
    rather than aborting the whole export. The frame size is taken from the
    first image; images that cannot be read are skipped.

    Args:
        image_dir: Directory containing the frame images.
        output_path: Path of the video file to write.

    Returns:
        None. Returns early when *image_dir* does not exist, contains no
        ``.png`` files, the first image cannot be read, or the video writer
        cannot be opened.
    """
    if not os.path.exists(image_dir):
        return

    def frame_order(name):
        # Numbered frames first (by number); un-numbered files last (by name).
        match = re.search(r"(\d+)", name)
        if match is None:
            return (1, name)
        return (0, int(match.group()))

    images = sorted(
        (name for name in os.listdir(image_dir) if name.endswith(".png")),
        key=frame_order,
    )

    if not images:
        print(f"No images found in {image_dir}")
        return

    frame = cv2.imread(os.path.join(image_dir, images[0]))
    if frame is None:
        print(f"Failed to read the first image in {image_dir}")
        return
    height, width = frame.shape[:2]

    video = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*"mp4v"), 30, (width, height)
    )
    # A writer that failed to open would drop every frame, so report it
    # instead of claiming success.
    if not video.isOpened():
        video.release()
        print(f"Failed to open video writer for {output_path}")
        return

    try:
        for image in images:
            img = cv2.imread(os.path.join(image_dir, image))
            if img is not None:
                video.write(img)
    finally:
        # Always finalize the container, even if reading or writing fails.
        video.release()

    print(f"Created video: {output_path}")


###############################################################################
# メイン
###############################################################################
def main():
    """Command-line entry point: extract frames, run the pipeline, report results.

    Steps:
      1. Parse ``--video_path`` and ``--output_dir`` and create the output dir.
      2. Start the FPS monitor thread.
      3. Split the video into frames under ``<output_dir>/frames``.
      4. If RTMpose is enabled, build the detector, pose estimator and
         visualizer and pass them to ``process_images``; otherwise call
         ``process_images`` with ``None`` for all three.
      5. Signal the monitor thread to stop and wait for it, even when an
         earlier step raised.
    """
    # Declared up front: the flag is written in the ``finally`` block below.
    # NOTE(review): presumably ``fps_monitor`` polls this flag -- confirm.
    global stop_fps_thread

    parser = argparse.ArgumentParser(description="Process video and generate results.")
    parser.add_argument(
        "--video_path",
        default="./video/tes.mp4",
        help="Path to the input video file",
    )
    parser.add_argument(
        "--output_dir",
        default="output",
        help="Directory to save output images and results",
    )

    # Model configuration and checkpoints used when RTMpose is enabled.
    det_config = "modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py"
    det_checkpoint = (
        "models/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth"
    )
    pose_config = (
        "modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/"
        "rtmpose-l_8xb256-420e_body8-256x192.py"
    )
    pose_checkpoint = "models/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth"

    args = parser.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # (1) FPS monitor thread.
    fps_thread = Thread(target=fps_monitor, args=(1.0,), daemon=True)
    fps_thread.start()

    try:
        # (2) Video -> frames.
        frames_dir = os.path.join(args.output_dir, "frames")
        video_to_frames(args.video_path, frames_dir)

        # (3) Initialise RTMpose only when it is enabled.
        if RTMPOSE_ENABLED:
            detector = init_detector(det_config, det_checkpoint, device=DEVICE)
            detector.cfg = adapt_mmdet_pipeline(detector.cfg)
            pose_estimator = init_pose_estimator(
                pose_config, pose_checkpoint, device=DEVICE
            )
            visualizer = VISUALIZERS.build(pose_estimator.cfg.visualizer)
            visualizer.set_dataset_meta(
                pose_estimator.dataset_meta, skeleton_style="mmpose"
            )
            process_images(args, detector, pose_estimator, visualizer)
        else:
            process_images(args, None, None, None)
    finally:
        # (4) Stop the monitor thread whether or not processing succeeded, so
        # a failure does not leave it running when main() is called from
        # another module.
        stop_fps_thread = True
        fps_thread.join()

    print("All done.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()