diff --git a/config.py b/config.py index 62d026d..4c46e69 100644 --- a/config.py +++ b/config.py @@ -22,6 +22,7 @@ RTMPOSE_ENABLED = True MOBILENETV1SSD_ENABLED = False YOLOX_ENABLED = True +EARSNET_CROP_ENABLED = True # Neural network model settings EARSNET_ENABLED = True diff --git a/main.py b/main.py index 89e0bf0..81ed270 100644 --- a/main.py +++ b/main.py @@ -11,7 +11,7 @@ import pandas as pd from mmdet.apis import DetInferencer, inference_detector, init_detector -# New imports for RTMPose +# RTMpose from mmpose.apis import inference_topdown from mmpose.apis import init_model as init_pose_estimator from mmpose.evaluation.functional import nms @@ -21,12 +21,16 @@ import config -# EARSNetPredictor のみをインポート +# EARSNet from modules.EARSNet.predictor import EARSNetPredictor + +# Utilities from util.calc_ste_position import CalcStethoscopePosition from util.ears_ai import EarsAI -# Get colors from config +############################################################################### +# Config 値を参照 +############################################################################### CONV_COLOR = config.CONV_COLOR XGBOOST_COLOR = config.XGBOOST_COLOR LIGHTGBM_COLOR = config.LIGHTGBM_COLOR @@ -34,7 +38,6 @@ CATBOOST_COLOR = config.CATBOOST_COLOR NGBOOST_COLOR = config.NGBOOST_COLOR -# Get model execution settings CONV_ENABLED = config.CONV_ENABLED XGBOOST_ENABLED = config.XGBOOST_ENABLED LIGHTGBM_ENABLED = config.LIGHTGBM_ENABLED @@ -46,16 +49,18 @@ YOLOX_ENABLED = config.YOLOX_ENABLED EARSNET_ENABLED = config.EARSNET_ENABLED -# Get normalization setting +# ★ クロップ画像を使う EARSNet (別モデル) を使うかどうか +EARSNET_CROP_ENABLED = config.EARSNET_CROP_ENABLED + NORMALIZE_ENABLED = config.NORMALIZE_ENABLED +DEVICE = config.DEVICE # "cuda" or "cpu" など + ############################################################################### # リアルタイムFPS計測用のグローバル変数&スレッド定義 ############################################################################### processed_frames = 0 # 処理済みフレーム数(メインスレッドでインクリメント) stop_fps_thread = False # スレッド終了フラグ - -# 必要に応じてリアルタイムFPSの履歴を保存するリスト (後でCSV化したい場合) fps_history = [] @@ -86,16 +91,14 @@ f"[FPS Monitor] Real-time FPS: {current_fps:.2f} (frames: +{frames_delta})" ) - # 履歴を残したい場合は下記を使用 fps_history.append((now, current_fps)) - # カウント更新 last_count = current_count last_time = now ############################################################################### -# 以下は従来の処理 (姿勢推定、聴診器検出、FPS計測など) +# モデルロード系 ############################################################################### def load_model(model_path, model_type="lgb"): with open(model_path, "rb") as model_file: @@ -107,6 +110,9 @@ return pickle.load(f) +############################################################################### +# YOLOX +############################################################################### def init_yolox(): try: from mmengine.registry import DefaultScope @@ -116,9 +122,8 @@ init_args = { "model": config.YOLOX_CONFIG_FILE, "weights": config.YOLOX_CHECKPOINT_FILE, - "device": config.DEVICE, + "device": DEVICE, } - yolox_inferencer = DetInferencer(**init_args) return yolox_inferencer @@ -140,6 +145,9 @@ def expand_points(p1, p2): + """ + 2点を中央から外側に拡張(肩や腰の領域を拡大する用途)するヘルパー関数 + """ mid_x = (p1[0] + p2[0]) / 2 mid_y = (p1[1] + p2[1]) / 2 @@ -180,7 +188,7 @@ stethoscope_y = None max_score = -1 - # keypoints 配列から部位を取得 + # keypoints 配列から部位を取得 (COCOフォーマット想定) nose = pose_keypoints[0] left_shoulder = pose_keypoints[5] right_shoulder = pose_keypoints[6] @@ -206,7 +214,7 @@ for i, (label, score) in enumerate( zip(predictions["labels"], predictions["scores"]) ): - # label=0 → 聴診器と仮定 (実際は学習クラスによって要変更) + # label=0 → 聴診器と仮定 (学習済みクラスのラベルに合わせる) if score >= score_thr and label == 0: bbox = predictions["bboxes"][i] center_x = (bbox[0] + bbox[2]) / 2 @@ -238,6 +246,9 @@ return stethoscope_overlay_img, stethoscope_x, stethoscope_y +############################################################################### +# 各種座標変換 +############################################################################### def normalize_quadrilateral_with_point(points, extra_point): """4点(肩・肩・腰・腰)と任意の1点(聴診器)を正規化して返す。""" all_points = np.vstack([points.reshape(-1, 2), extra_point]) @@ -263,6 +274,7 @@ ) if max_edge_length == 0: return rotated_points # 0割り防止 + return rotated_points / max_edge_length @@ -289,6 +301,9 @@ print(f"All frames saved to {output_dir}") +############################################################################### +# RTMpose キーポイント抽出 +############################################################################### def extract_keypoints_rtmpose(pose_results): if not pose_results: print("No pose results found.") @@ -312,16 +327,48 @@ return keypoints +############################################################################### +# 胴体クロップ生成 +############################################################################### +def crop_body_from_keypoints(frame, left_shoulder, right_shoulder, left_hip, right_hip): + """ + RTMPOSE 等で推定された肩・腰をもとに胴体をざっくり囲むバウンディングボックスを計算し、 + そこをクロップして返す。 + 戻り値: (cropped_frame, (xmin, ymin)) + cropped_frame: クロップ後の画像 (np.ndarray) + (xmin, ymin): クロップ領域の左上座標 (元画像座標系へのマッピング用) + """ + h, w, _ = frame.shape + + # 左右肩・左右腰 4点の x, y + xs = [left_shoulder[0], right_shoulder[0], left_hip[0], right_hip[0]] + ys = [left_shoulder[1], right_shoulder[1], left_hip[1], right_hip[1]] + + xmin = int(min(xs)) + xmax = int(max(xs)) + ymin = int(min(ys)) + ymax = int(max(ys)) + + # 多少のマージンを足す (上下左右に 20 ピクセルなど) + margin = 20 + xmin = max(0, xmin - margin) + xmax = min(w, xmax + margin) + ymin = max(0, ymin - margin) + ymax = min(h, ymax + margin) + + cropped_frame = frame[ymin:ymax, xmin:xmax].copy() + + return cropped_frame, (xmin, ymin) + + +############################################################################### +# メイン処理 +############################################################################### def process_images(args, detector, pose_estimator, visualizer): - """ - メインスレッドでフレームごとの推論を行う。 - 別スレッドでリアルタイムFPSを計測しているため、 - フレーム処理終了後に processed_frames をインクリメントする。 - """ - print("Starting process_images function...") - global processed_frames # 別スレッドと共有 + global processed_frames ears_ai = EarsAI() calc_position = CalcStethoscopePosition() + base_dir = os.path.join(args.output_dir, "frames") results_dir = os.path.join(args.output_dir, "results") csv_path = os.path.join(results_dir, "results.csv") @@ -329,6 +376,10 @@ pose_overlay_dir = os.path.join(results_dir, "pose_overlay_image") stethoscope_overlay_dir = os.path.join(results_dir, "stethoscope_overlay_image") + # クロップ画像を保存するディレクトリを作成 + cropped_dir = os.path.join(results_dir, "cropped_images") + os.makedirs(cropped_dir, exist_ok=True) + os.makedirs(results_dir, exist_ok=True) os.makedirs(pose_overlay_dir, exist_ok=True) os.makedirs(stethoscope_overlay_dir, exist_ok=True) @@ -337,21 +388,21 @@ [f for f in os.listdir(base_dir) if f.lower().endswith(".png")], key=lambda x: int(re.search(r"(\d+)", x).group(1)), ) - print(f"Found {len(png_files)} PNG files.") + print(f"Found {len(png_files)} PNG files in {base_dir}.") rows = [] normalized_rows = [] - # ---------------- + # ------------------------------------------ # YOLOX 初期化 - # ---------------- + # ------------------------------------------ yolox_inferencer = None if YOLOX_ENABLED: yolox_inferencer = init_yolox() - # ---------------- + # ------------------------------------------ # 時間計測用 dict - # ---------------- + # ------------------------------------------ timings = { # 単体推論 "rtmpose_single": [], @@ -360,26 +411,31 @@ "lightgbm_single": [], "xgboost_single": [], "earsnet_single": [], - # パイプライン推論 (RTMPose+YOLOX → 各モデル) + "earsnet_cropped_single": [], + # パイプライン推論 "pipeline_rtmpose_yolox_conv": [], "pipeline_rtmpose_yolox_lightgbm": [], "pipeline_rtmpose_yolox_xgboost": [], - "pipeline_earsnet": [], + # 今回修正 + "pipeline_earsnet": [], # EARSNet 単体 + "pipeline_earsnet_cropped": [], # RTMPose + EARSNet(クロップ) } - # ---------------- + # ------------------------------------------ # 各モデルの事前ロード - # ---------------- + # ------------------------------------------ if LIGHTGBM_ENABLED: lgb_model_x = load_model("./models/LightGBM/stethoscope_calc_x_best_model.pkl") lgb_model_y = load_model("./models/LightGBM/stethoscope_calc_y_best_model.pkl") lgb_scaler_x = load_scaler("./models/LightGBM/scaler-x.pkl") lgb_scaler_y = load_scaler("./models/LightGBM/scaler-y.pkl") + if XGBOOST_ENABLED: xg_model_x = load_model("./models/XGBoost/stethoscope_calc_x_best_model.pkl") xg_model_y = load_model("./models/XGBoost/stethoscope_calc_y_best_model.pkl") xg_scaler_x = load_scaler("./models/XGBoost/scaler-x.pkl") xg_scaler_y = load_scaler("./models/XGBoost/scaler-y.pkl") + if CATBOOST_ENABLED: catboost_model_x = load_model( "./models/CatBoost/stethoscope_calc_x_best_model.pkl" @@ -387,6 +443,7 @@ catboost_model_y = load_model( "./models/CatBoost/stethoscope_calc_y_best_model.pkl" ) + if NGBOOST_ENABLED: ngboost_model_x = load_model( "./models/NGBoost/stethoscope_calc_x_best_model.pkl" @@ -395,15 +452,25 @@ "./models/NGBoost/stethoscope_calc_y_best_model.pkl" ) - # EARSNET を使用する場合のみ初期化 + # 通常 EARSNet (クロップなし) if EARSNET_ENABLED: earsnet_predictor = EARSNetPredictor( weight_path="models/EARSNet/best_model.pth", - resnet_depth="18", # 学習時と同じResNet深度 - pretrained=True, # 学習時の設定に合わせる - device="cuda", # or "cpu" + resnet_depth="18", + pretrained=True, + device=DEVICE, ) + # クロップ画像用 EARSNet (別モデル) + if EARSNET_CROP_ENABLED: + earsnet_cropped_predictor = EARSNetPredictor( + weight_path="models/EARSNet/crop/best_model.pth", # 想定モデルファイル + resnet_depth="18", + pretrained=True, + device=DEVICE, + ) + + # CSVで使用する列 input_columns = [ "left_shoulder_x", "left_shoulder_y", @@ -427,37 +494,15 @@ print(f"Failed to load image: {image_path}") continue - # (A) -- 姿勢推定(RTMPose or PoseNet) & YOLOX 推論までの時間測定の準備 - pipeline_detection_start = time.time() - - # ============================================================ - # (1) PoseNet or RTMPOSE による姿勢推定(肩・腰座標取得) - # ============================================================ - left_shoulder = (0, 0) - right_shoulder = (0, 0) - left_hip = (0, 0) - right_hip = (0, 0) - pose_overlay_img = frame.copy() - - if POSENET_ENABLED: - # ▼ PoseNet - start_time_pose = time.time() - pose_overlay_img, *landmarks = ears_ai.pose_detect(frame, None) - end_time_pose = time.time() - timings["rtmpose_single"].append(end_time_pose - start_time_pose) - - left_shoulder = landmarks[0] - right_shoulder = landmarks[1] - left_hip = landmarks[2] - right_hip = landmarks[3] - - elif RTMPOSE_ENABLED: - # ▼ RTMPOSE + # (A) RTMPose + rtmpose_time = 0.0 + if RTMPOSE_ENABLED: + start_time_rtmpose = time.time() + # ===== RTMpose推論 ===== frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - - start_time_pose = time.time() det_result = inference_detector(detector, frame_rgb) pred_instance = det_result.pred_instances.cpu().numpy() + bboxes = np.concatenate( (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1 ) @@ -466,24 +511,20 @@ np.logical_and(pred_instance.labels == 0, pred_instance.scores > 0.3) ] bboxes = bboxes[nms(bboxes, 0.3), :4] + pose_results = inference_topdown(pose_estimator, frame_rgb, bboxes) data_samples = merge_data_samples(pose_results) pose_keypoints = extract_keypoints_rtmpose(pose_results) - end_time_pose = time.time() - timings["rtmpose_single"].append(end_time_pose - start_time_pose) + end_time_rtmpose = time.time() + rtmpose_time = end_time_rtmpose - start_time_rtmpose + timings["rtmpose_single"].append(rtmpose_time) if pose_keypoints is None: print(f"Failed to extract keypoints for image: {image_path}") processed_frames += 1 continue - left_shoulder = pose_keypoints[5] - right_shoulder = pose_keypoints[6] - left_hip = pose_keypoints[11] - right_hip = pose_keypoints[12] - - # 可視化 if visualizer is not None: visualizer.add_datasample( "result", @@ -499,21 +540,45 @@ kpt_thr=0.3, ) pose_overlay_img = visualizer.get_image() # (RGB) - - # ============================================================ - # (2) YOLOX or SSD で聴診器の推定(必要に応じて) - # ============================================================ - stethoscope_overlay_img = frame.copy() - stethoscope_x = 0 - stethoscope_y = 0 - - if MobileNetV1SSD_ENABLED: - start_time_ssd = time.time() - stethoscope_overlay_img, stethoscope_x, stethoscope_y = ears_ai.ssd_detect( - frame, None + pose_overlay_bgr = cv2.cvtColor(pose_overlay_img, cv2.COLOR_RGB2BGR) + cv2.imwrite( + os.path.join(pose_overlay_dir, image_file_name), pose_overlay_bgr ) - end_time_ssd = time.time() + # COCOフォーマットのキーポイントを取り出す + left_shoulder = (pose_keypoints[5][0], pose_keypoints[5][1]) + right_shoulder = (pose_keypoints[6][0], pose_keypoints[6][1]) + left_hip = (pose_keypoints[11][0], pose_keypoints[11][1]) + right_hip = (pose_keypoints[12][0], pose_keypoints[12][1]) + + elif POSENET_ENABLED: + # 既存 PoseNet + start_time_rtmpose = time.time() + pose_overlay_img, *landmarks = ears_ai.pose_detect(frame, None) + end_time_rtmpose = time.time() + rtmpose_time = end_time_rtmpose - start_time_rtmpose + timings["rtmpose_single"].append(rtmpose_time) + + # landmarks = [left_shoulder, right_shoulder, left_hip, right_hip] + left_shoulder = landmarks[0] + right_shoulder = landmarks[1] + left_hip = landmarks[2] + right_hip = landmarks[3] + + # pose_overlay_img はすでに BGR 形式想定 + cv2.imwrite( + os.path.join(pose_overlay_dir, image_file_name), pose_overlay_img + ) + else: + # RTMPose/PoseNet どちらも有効でない場合 + left_shoulder = (0, 0) + right_shoulder = (0, 0) + left_hip = (0, 0) + right_hip = (0, 0) + + # (B) YOLOX (必要なら) + yolox_time = 0.0 + stethoscope_x, stethoscope_y = 0, 0 if YOLOX_ENABLED: if ( RTMPOSE_ENABLED @@ -521,114 +586,119 @@ and pose_keypoints is not None ): start_time_yolox = time.time() - ( - stethoscope_overlay_img, - stethoscope_x, - stethoscope_y, - ) = yolox_detector_inference(frame, yolox_inferencer, pose_keypoints) + stethoscope_overlay_img, stethoscope_x, stethoscope_y = ( + yolox_detector_inference(frame, yolox_inferencer, pose_keypoints) + ) end_time_yolox = time.time() - timings["yolox_single"].append(end_time_yolox - start_time_yolox) + yolox_time = end_time_yolox - start_time_yolox + timings["yolox_single"].append(yolox_time) + + # 可視化 + cv2.imwrite( + os.path.join(stethoscope_overlay_dir, image_file_name), + stethoscope_overlay_img, + ) elif POSENET_ENABLED: + # PoseNet 用のキー配列に変換してYOLOX pose_keypoints_pose_net = [[0, 0]] * 13 - pose_keypoints_pose_net[5] = left_shoulder - pose_keypoints_pose_net[6] = right_shoulder - pose_keypoints_pose_net[11] = left_hip - pose_keypoints_pose_net[12] = right_hip + pose_keypoints_pose_net[5] = (left_shoulder[0], left_shoulder[1]) + pose_keypoints_pose_net[6] = (right_shoulder[0], right_shoulder[1]) + pose_keypoints_pose_net[11] = (left_hip[0], left_hip[1]) + pose_keypoints_pose_net[12] = (right_hip[0], right_hip[1]) start_time_yolox = time.time() - ( - stethoscope_overlay_img, - stethoscope_x, - stethoscope_y, - ) = yolox_detector_inference( - frame, yolox_inferencer, pose_keypoints_pose_net + stethoscope_overlay_img, stethoscope_x, stethoscope_y = ( + yolox_detector_inference( + frame, yolox_inferencer, pose_keypoints_pose_net + ) ) end_time_yolox = time.time() - timings["yolox_single"].append(end_time_yolox - start_time_yolox) + yolox_time = end_time_yolox - start_time_yolox + timings["yolox_single"].append(yolox_time) - # (A') -- RTMPose + YOLOX の検出処理終了時刻 (パイプライン計測用) - pipeline_detection_end = time.time() - detection_time = pipeline_detection_end - pipeline_detection_start - - # 可視化結果を保存 - if (RTMPOSE_ENABLED or POSENET_ENABLED) and ( - YOLOX_ENABLED or MobileNetV1SSD_ENABLED - ): - if RTMPOSE_ENABLED: + # 可視化 cv2.imwrite( - os.path.join(pose_overlay_dir, image_file_name), - cv2.cvtColor(pose_overlay_img, cv2.COLOR_RGB2BGR), - ) - else: - cv2.imwrite( - os.path.join(pose_overlay_dir, image_file_name), - pose_overlay_img, + os.path.join(stethoscope_overlay_dir, image_file_name), + stethoscope_overlay_img, ) - cv2.imwrite( - os.path.join(stethoscope_overlay_dir, image_file_name), - stethoscope_overlay_img, - ) + # ここで、(RTMPose + YOLOX) の合計検出時間をパイプラインに使う場合あり + detection_time_rtmpose_yolox = rtmpose_time + yolox_time - # ============================================================ - # (3) CSV用に肩・腰・聴診器座標をまとめる - # ============================================================ - if POSENET_ENABLED: - row = { - "image_file_name": image_file_name, - "left_shoulder_x": left_shoulder[1], - "left_shoulder_y": left_shoulder[0], - "right_shoulder_x": right_shoulder[1], - "right_shoulder_y": right_shoulder[0], - "left_hip_x": left_hip[1], - "left_hip_y": left_hip[0], - "right_hip_x": right_hip[1], - "right_hip_y": right_hip[0], - "stethoscope_x": stethoscope_x, - "stethoscope_y": stethoscope_y, - } - else: - row = { - "image_file_name": image_file_name, - "left_shoulder_x": left_shoulder[0], - "left_shoulder_y": left_shoulder[1], - "right_shoulder_x": right_shoulder[0], - "right_shoulder_y": right_shoulder[1], - "left_hip_x": left_hip[0], - "left_hip_y": left_hip[1], - "right_hip_x": right_hip[0], - "right_hip_y": right_hip[1], - "stethoscope_x": stethoscope_x, - "stethoscope_y": stethoscope_y, - } + # CSV用に座標をまとめる + row = { + "image_file_name": image_file_name, + "left_shoulder_x": left_shoulder[0], + "left_shoulder_y": left_shoulder[1], + "right_shoulder_x": right_shoulder[0], + "right_shoulder_y": right_shoulder[1], + "left_hip_x": left_hip[0], + "left_hip_y": left_hip[1], + "right_hip_x": right_hip[0], + "right_hip_y": right_hip[1], + "stethoscope_x": stethoscope_x, + "stethoscope_y": stethoscope_y, + } - # (C) EARSNET + # (C) EARSNet 単体 + # -> pipeline_earsnet は RTMPose, YOLOX を含まない if EARSNET_ENABLED: - start_earsnet = time.time() + start_time_earsnet = time.time() earsnet_x, earsnet_y = earsnet_predictor.predict(image_path) - end_earsnet = time.time() - timings["earsnet_single"].append(end_earsnet - start_earsnet) + end_time_earsnet = time.time() + + earsnet_time = end_time_earsnet - start_time_earsnet + timings["earsnet_single"].append(earsnet_time) + + # pipeline_earsnet = earsnet単体時間 + timings["pipeline_earsnet"].append(earsnet_time) row["earsnet_stethoscope_x"] = earsnet_x row["earsnet_stethoscope_y"] = earsnet_y - # EARSNETパイプライン時間 (単体処理として計測しておく) - pipeline_earsnet_time = end_earsnet - start_earsnet - timings["pipeline_earsnet"].append(pipeline_earsnet_time) + # (D) クロップ画像 EARSNet ( RTMPose + EARSNet_Cropped ) + if EARSNET_CROP_ENABLED: + # 1) クロップ生成 + cropped_img, (crop_xmin, crop_ymin) = crop_body_from_keypoints( + frame, left_shoulder, right_shoulder, left_hip, right_hip + ) + # クロップ画像を保存(確認用) + cropped_filename = os.path.splitext(image_file_name)[0] + "_cropped.png" + cv2.imwrite(os.path.join(cropped_dir, cropped_filename), cropped_img) - rows.append(row) + # 2) EARSNet (クロップ版) + start_time_earsnet_cropped = time.time() + earsnet_cropped_x, earsnet_cropped_y = earsnet_cropped_predictor.predict( + os.path.join(cropped_dir, cropped_filename) + ) + end_time_earsnet_cropped = time.time() - # ============================================================ - # (4) 正規化処理 (4点+聴診器) - # ============================================================ + earsnet_cropped_time = end_time_earsnet_cropped - start_time_earsnet_cropped + timings["earsnet_cropped_single"].append(earsnet_cropped_time) + + # pipeline_earsnet_cropped = RTMPose時間 + EARSNet(クロップ) + pipeline_earsnet_cropped_time = rtmpose_time + earsnet_cropped_time + timings["pipeline_earsnet_cropped"].append(pipeline_earsnet_cropped_time) + + # 3) 座標を元画像に変換 + global_x = earsnet_cropped_x + global_y = earsnet_cropped_y + + row["earsnet_crop_stethoscope_x"] = global_x + row["earsnet_crop_stethoscope_y"] = global_y + + # (E) 正規化 source_points = np.array( [ - [float(row[f"{pos}_x"]), float(row[f"{pos}_y"])] - for pos in ["left_shoulder", "right_shoulder", "left_hip", "right_hip"] + [float(row["left_shoulder_x"]), float(row["left_shoulder_y"])], + [float(row["right_shoulder_x"]), float(row["right_shoulder_y"])], + [float(row["left_hip_x"]), float(row["left_hip_y"])], + [float(row["right_hip_x"]), float(row["right_hip_y"])], ], dtype=np.float32, ) + stethoscope_point = np.array( [float(row["stethoscope_x"]), float(row["stethoscope_y"])] ) @@ -649,17 +719,43 @@ "stethoscope_x": normalized_points[4, 0], "stethoscope_y": normalized_points[4, 1], } - if EARSNET_ENABLED: - normalized_row["earsnet_stethoscope_x"] = row["earsnet_stethoscope_x"] - normalized_row["earsnet_stethoscope_y"] = row["earsnet_stethoscope_y"] + if EARSNET_ENABLED: + stetho_point_earsnet = np.array( + [ + float(row.get("earsnet_stethoscope_x", 0)), + float(row.get("earsnet_stethoscope_y", 0)), + ] + ) + norm_earsnet = normalize_quadrilateral_with_point( + source_points.flatten(), stetho_point_earsnet + ) + normalized_row["earsnet_stethoscope_x"] = norm_earsnet[4, 0] + normalized_row["earsnet_stethoscope_y"] = norm_earsnet[4, 1] + + if EARSNET_CROP_ENABLED: + stetho_point_crop = np.array( + [ + float(row.get("earsnet_crop_stethoscope_x", 0)), + float(row.get("earsnet_crop_stethoscope_y", 0)), + ] + ) + norm_earsnet_crop = normalize_quadrilateral_with_point( + source_points.flatten(), stetho_point_crop + ) + normalized_row["earsnet_crop_stethoscope_x"] = norm_earsnet_crop[4, 0] + normalized_row["earsnet_crop_stethoscope_y"] = norm_earsnet_crop[4, 1] + + rows.append(row) normalized_rows.append(normalized_row) - # (5) パイプライン推論 (Conv, LightGBM, XGBoost, etc.) + # (F) パイプライン (RTMPose+YOLOX → Conv/LightGBM/XGBoost) + # ここは従来通り: detection_time_rtmpose_yolox + 各モデル時間 + if RTMPOSE_ENABLED and YOLOX_ENABLED: # conv if CONV_ENABLED: - conv_start = time.time() + start_conv = time.time() source_pts = np.array( [ [float(row[f"{pos}_x"]), float(row[f"{pos}_y"])] @@ -675,49 +771,52 @@ stetho_pt = np.array( [float(row["stethoscope_x"]), float(row["stethoscope_y"])] ) - conv_stethoscope = calc_position.calc_affine(source_pts, *stetho_pt) - conv_end = time.time() - timings["conv_single"].append(conv_end - conv_start) + _ = calc_position.calc_affine(source_pts, *stetho_pt) + end_conv = time.time() + conv_time = end_conv - start_conv + timings["conv_single"].append(conv_time) - pipeline_time_conv = detection_time + (conv_end - conv_start) - timings["pipeline_rtmpose_yolox_conv"].append(pipeline_time_conv) + # pipeline_rtmpose_yolox_conv + timings["pipeline_rtmpose_yolox_conv"].append( + detection_time_rtmpose_yolox + conv_time + ) # XGBoost if XGBOOST_ENABLED: xg_start = time.time() - input_data_xg = ( - pd.DataFrame([normalized_rows[-1]]) - if NORMALIZE_ENABLED - else pd.DataFrame([rows[-1]]) - ) + if NORMALIZE_ENABLED: + input_data_xg = pd.DataFrame([normalized_rows[-1]]) + else: + input_data_xg = pd.DataFrame([rows[-1]]) X_scaled_x = xg_scaler_x.transform(input_data_xg[input_columns]) - xg_x_pred = int(xg_model_x.predict(X_scaled_x)[0]) + _ = xg_model_x.predict(X_scaled_x)[0] X_scaled_y = xg_scaler_y.transform(input_data_xg[input_columns]) - xg_y_pred = int(xg_model_y.predict(X_scaled_y)[0]) + _ = xg_model_y.predict(X_scaled_y)[0] xg_end = time.time() - timings["xgboost_single"].append(xg_end - xg_start) + xg_time = xg_end - xg_start + timings["xgboost_single"].append(xg_time) - pipeline_time_xgboost = detection_time + (xg_end - xg_start) - timings["pipeline_rtmpose_yolox_xgboost"].append(pipeline_time_xgboost) + timings["pipeline_rtmpose_yolox_xgboost"].append( + detection_time_rtmpose_yolox + xg_time + ) # LightGBM if LIGHTGBM_ENABLED: lgb_start = time.time() - input_data_lgb = ( - pd.DataFrame([normalized_rows[-1]]) - if NORMALIZE_ENABLED - else pd.DataFrame([rows[-1]]) - ) + if NORMALIZE_ENABLED: + input_data_lgb = pd.DataFrame([normalized_rows[-1]]) + else: + input_data_lgb = pd.DataFrame([rows[-1]]) X_scaled_x = lgb_scaler_x.transform(input_data_lgb[input_columns]) - lgb_x_pred = int(lgb_model_x.predict(X_scaled_x)[0]) + _ = lgb_model_x.predict(X_scaled_x)[0] X_scaled_y = lgb_scaler_y.transform(input_data_lgb[input_columns]) - lgb_y_pred = int(lgb_model_y.predict(X_scaled_y)[0]) + _ = lgb_model_y.predict(X_scaled_y)[0] lgb_end = time.time() - timings["lightgbm_single"].append(lgb_end - lgb_start) + lgb_time = lgb_end - lgb_start + timings["lightgbm_single"].append(lgb_time) - pipeline_time_lightgbm = detection_time + (lgb_end - lgb_start) timings["pipeline_rtmpose_yolox_lightgbm"].append( - pipeline_time_lightgbm + detection_time_rtmpose_yolox + lgb_time ) processed_frames += 1 @@ -726,42 +825,22 @@ # CSV 書き込み # ======================================================================== if rows: - print(f"Writing {len(rows)} rows to CSV...") fieldnames = list(rows[0].keys()) - if CONV_ENABLED: - fieldnames.extend(["conv_stethoscope_x", "conv_stethoscope_y"]) - if XGBOOST_ENABLED: - fieldnames.extend(["Xgboost_stethoscope_x", "Xgboost_stethoscope_y"]) - if LIGHTGBM_ENABLED: - fieldnames.extend(["lightGBM_stethoscope_x", "lightGBM_stethoscope_y"]) - if CATBOOST_ENABLED: - fieldnames.extend(["catboost_stethoscope_x", "catboost_stethoscope_y"]) - if NGBOOST_ENABLED: - fieldnames.extend(["ngboost_stethoscope_x", "ngboost_stethoscope_y"]) - csvfile_path = os.path.join(results_dir, "results.csv") normfile_path = os.path.join(results_dir, "results-convert.csv") + + os.makedirs(results_dir, exist_ok=True) + with ( open(csvfile_path, "w", newline="") as csvfile, open(normfile_path, "w", newline="") as norm_csvfile, ): writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - norm_writer = csv.DictWriter(norm_csvfile, fieldnames=fieldnames) writer.writeheader() - norm_writer.writeheader() - # 前回値を保持する辞書 (未検出時に使いたい場合) - prev_values = {} - if CONV_ENABLED: - prev_values["conv"] = (180, 180) - if LIGHTGBM_ENABLED: - prev_values["lightGBM"] = (180, 180) - if XGBOOST_ENABLED: - prev_values["Xgboost"] = (180, 180) - if CATBOOST_ENABLED: - prev_values["catboost"] = (180, 180) - if NGBOOST_ENABLED: - prev_values["ngboost"] = (180, 180) + norm_fieldnames = list(normalized_rows[0].keys()) + norm_writer = csv.DictWriter(norm_csvfile, fieldnames=norm_fieldnames) + norm_writer.writeheader() for row_, norm_row_ in zip(rows, normalized_rows): writer.writerow(row_) @@ -770,12 +849,13 @@ print(f"Processed and saved results to: {csvfile_path}") print(f"Processed and saved normalized results to: {normfile_path}") + # 可視化・動画化 generate_visualizations(csvfile_path, base_dir, results_dir) else: print("No data to write to CSV.") # ======================================================================== - # (6) FPS計算 & CSV保存 (サブコンポーネント&パイプラインごとの合計/平均) + # FPS計算 & CSV保存 (サブコンポーネント&パイプラインごとの合計/平均) # ======================================================================== fps_data = [] for method_name, time_list in timings.items(): @@ -819,14 +899,18 @@ ) +############################################################################### +# 可視化・動画化 +############################################################################### def generate_visualizations(csv_path, original_images_dir, results_dir): """ - NaNが混入した場合に描画でエラーにならないように修正。 - 聴診器の推論結果がNaNの場合は描画をスキップする。 + CSVに書き込んだ推定結果を用い、BodyF.pngへの描画や動画化を行う。 + EARSNetクロップ版の結果も描画できるように調整。 """ df = pd.read_csv(csv_path) body_image = cv2.imread("./images/body/BodyF.png") + # 生成ディレクトリ設定 dirs = {"marked": "marked_images"} if CONV_ENABLED: dirs["conv"] = "conv" @@ -840,6 +924,9 @@ dirs["ngboost"] = "ngboost" if EARSNET_ENABLED: dirs["earsnet"] = "earsnet" + if EARSNET_CROP_ENABLED: + dirs["earsnet_crop"] = "earsnet_crop" + dirs["combined"] = "combined" os.makedirs(os.path.join(results_dir, "marked_images"), exist_ok=True) @@ -853,7 +940,10 @@ exist_ok=True, ) - points = {key: [] for key in dirs.keys() if key != "marked"} + # 描画に使う座標列 + points = {key: [] for key in dirs.keys() if key not in ["marked", "combined"]} + + # 色設定 colors = { "conv": CONV_COLOR, "Xgboost": XGBOOST_COLOR, @@ -861,13 +951,13 @@ "catboost": CATBOOST_COLOR, "ngboost": NGBOOST_COLOR, "earsnet": EARSNET_COLOR, + "earsnet_crop": (255, 51, 255), # ピンク系 } for _, row in df.iterrows(): original_image_path = os.path.join(original_images_dir, row["image_file_name"]) if not os.path.exists(original_image_path): continue - original_image = cv2.imread(original_image_path) if original_image is None: continue @@ -885,7 +975,6 @@ if col_x in row and col_y in row: val_x = row[col_x] val_y = row[col_y] - # NaNチェック if pd.isna(val_x) or pd.isna(val_y): continue cv2.circle( @@ -895,46 +984,48 @@ (255, 255, 0), -1, ) - + # 保存 + marked_dir = os.path.join(results_dir, "marked_images") cv2.imwrite( - os.path.join(results_dir, "marked_images", row["image_file_name"]), + os.path.join(marked_dir, row["image_file_name"]), original_image, ) - # BodyF.png の上に軌跡を描画する + # BodyF.png に軌跡を描画 combined_image_with_traj = body_image.copy() combined_image_without_traj = body_image.copy() - for key in points: + # 各推定結果を描画 + for key in points.keys(): col_x = f"{key}_stethoscope_x" col_y = f"{key}_stethoscope_y" if col_x not in row or col_y not in row: continue - val_x = row[col_x] val_y = row[col_y] - # NaNであればスキップ if pd.isna(val_x) or pd.isna(val_y): continue x, y = int(val_x), int(val_y) points[key].append((x, y)) - # 1) 個別 with trajectory + color = colors[key] if key in colors else (0, 0, 255) + + # 個別 with trajectory image_with_trajectory = body_image.copy() if len(points[key]) > 1: cv2.polylines( image_with_trajectory, [np.array(points[key])], False, - colors.get(key, (0, 0, 255)), + color, 2, ) cv2.circle( image_with_trajectory, (x, y), 10, - colors.get(key, (0, 0, 255)), + color, -1, ) cv2.imwrite( @@ -944,13 +1035,13 @@ image_with_trajectory, ) - # 2) 個別 without trajectory + # 個別 without trajectory image_without_trajectory = body_image.copy() cv2.circle( image_without_trajectory, (x, y), 10, - colors.get(key, (0, 0, 255)), + color, -1, ) cv2.imwrite( @@ -962,28 +1053,28 @@ image_without_trajectory, ) - # 3) combined with trajectory + # combined with trajectory if len(points[key]) > 1: cv2.polylines( combined_image_with_traj, [np.array(points[key])], False, - colors.get(key, (0, 0, 255)), + color, 2, ) cv2.circle( combined_image_with_traj, (x, y), 10, - colors.get(key, (0, 0, 255)), + color, -1, ) - # 4) combined without trajectory + # combined without trajectory cv2.circle( combined_image_without_traj, (x, y), 10, - colors.get(key, (0, 0, 255)), + color, -1, ) @@ -1015,7 +1106,7 @@ ) for key in dirs: - if key != "marked": + if key not in ["marked", "combined"]: create_video_from_images( os.path.join(results_dir, f"{dirs[key]}_with_trajectory"), os.path.join(results_dir, f"{key}_video_with_trajectory.mp4"), @@ -1027,9 +1118,6 @@ def create_video_from_images(image_dir, output_path): - """ - 指定ディレクトリ内の PNG 画像を1つの動画に変換する - """ if not os.path.exists(image_dir): return images = sorted( @@ -1042,6 +1130,9 @@ return frame = cv2.imread(os.path.join(image_dir, images[0])) + if frame is None: + print(f"Failed to read the first image in {image_dir}") + return height, width, _ = frame.shape video = cv2.VideoWriter( @@ -1051,7 +1142,8 @@ for image in images: img_path = os.path.join(image_dir, image) img = cv2.imread(img_path) - video.write(img) + if img is not None: + video.write(img) video.release() print(f"Created video: {output_path}") @@ -1069,6 +1161,8 @@ default="output", help="Directory to save output images and results", ) + + # RTMpose 用の config & checkpoint (必要に応じて変更) det_config = "modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py" det_checkpoint = ( "models/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth" @@ -1083,26 +1177,20 @@ os.makedirs(args.output_dir, exist_ok=True) - # ------------------------- # 1) FPSモニタ用スレッド開始 - # ------------------------- fps_thread = Thread(target=fps_monitor, args=(1.0,), daemon=True) fps_thread.start() - # ------------------------- # 2) 動画をフレームに分割 - # ------------------------- frames_dir = os.path.join(args.output_dir, "frames") video_to_frames(args.video_path, frames_dir) - # ------------------------- - # 3) RTMPOSE初期化 (必要に応じて) - # ------------------------- + # 3) RTMPOSE初期化 (必要なときのみ) if RTMPOSE_ENABLED: - detector = init_detector(det_config, det_checkpoint, device="cuda:0") + detector = init_detector(det_config, det_checkpoint, device=DEVICE) detector.cfg = adapt_mmdet_pipeline(detector.cfg) pose_estimator = init_pose_estimator( - pose_config, pose_checkpoint, device="cuda:0" + pose_config, pose_checkpoint, device=DEVICE ) visualizer = VISUALIZERS.build(pose_estimator.cfg.visualizer) visualizer.set_dataset_meta( @@ -1113,9 +1201,7 @@ else: process_images(args, None, None, None) - # ------------------------- # 4) スレッド終了指示・join - # ------------------------- global stop_fps_thread stop_fps_thread = True fps_thread.join()