diff --git a/main.py b/main.py index 42aec28..89e0bf0 100644 --- a/main.py +++ b/main.py @@ -21,7 +21,7 @@ import config -# --- EARSNetPredictor のみをインポート --- +# EARSNetPredictor のみをインポート from modules.EARSNet.predictor import EARSNetPredictor from util.calc_ste_position import CalcStethoscopePosition from util.ears_ai import EarsAI @@ -206,7 +206,8 @@ for i, (label, score) in enumerate( zip(predictions["labels"], predictions["scores"]) ): - if score >= score_thr and label == 0: # label=0 を聴診器として判定 + # label=0 → 聴診器と仮定 (実際は学習クラスによって要変更) + if score >= score_thr and label == 0: bbox = predictions["bboxes"][i] center_x = (bbox[0] + bbox[2]) / 2 center_y = (bbox[1] + bbox[3]) / 2 @@ -260,6 +261,8 @@ np.roll(rotated_points[:4], -1, axis=0) - rotated_points[:4], axis=1 ) ) + if max_edge_length == 0: + return rotated_points # 0割り防止 return rotated_points / max_edge_length @@ -357,7 +360,7 @@ "lightgbm_single": [], "xgboost_single": [], "earsnet_single": [], - # パイプライン推論 + # パイプライン推論 (RTMPose+YOLOX → 各モデル) "pipeline_rtmpose_yolox_conv": [], "pipeline_rtmpose_yolox_lightgbm": [], "pipeline_rtmpose_yolox_xgboost": [], @@ -424,6 +427,9 @@ print(f"Failed to load image: {image_path}") continue + # (A) -- 姿勢推定(RTMPose or PoseNet) & YOLOX 推論までの時間測定の準備 + pipeline_detection_start = time.time() + # ============================================================ # (1) PoseNet or RTMPOSE による姿勢推定(肩・腰座標取得) # ============================================================ @@ -455,6 +461,7 @@ bboxes = np.concatenate( (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1 ) + # 人物のみ (label=0想定) bboxes = bboxes[ np.logical_and(pred_instance.labels == 0, pred_instance.scores > 0.3) ] @@ -500,7 +507,6 @@ stethoscope_x = 0 stethoscope_y = 0 - # SSD (MobileNetV1SSD) if MobileNetV1SSD_ENABLED: start_time_ssd = time.time() stethoscope_overlay_img, stethoscope_x, stethoscope_y = ears_ai.ssd_detect( @@ -508,7 +514,6 @@ ) end_time_ssd = time.time() - # YOLOX if YOLOX_ENABLED: if ( RTMPOSE_ENABLED @@ -516,9 +521,11 @@ and pose_keypoints is not None ): start_time_yolox = time.time() - (stethoscope_overlay_img, stethoscope_x, stethoscope_y) = ( - yolox_detector_inference(frame, yolox_inferencer, pose_keypoints) - ) + ( + stethoscope_overlay_img, + stethoscope_x, + stethoscope_y, + ) = yolox_detector_inference(frame, yolox_inferencer, pose_keypoints) end_time_yolox = time.time() timings["yolox_single"].append(end_time_yolox - start_time_yolox) @@ -530,17 +537,21 @@ pose_keypoints_pose_net[12] = right_hip start_time_yolox = time.time() - (stethoscope_overlay_img, stethoscope_x, stethoscope_y) = ( - yolox_detector_inference( - frame, yolox_inferencer, pose_keypoints_pose_net - ) + ( + stethoscope_overlay_img, + stethoscope_x, + stethoscope_y, + ) = yolox_detector_inference( + frame, yolox_inferencer, pose_keypoints_pose_net ) end_time_yolox = time.time() timings["yolox_single"].append(end_time_yolox - start_time_yolox) - # --------------------------------------------------------- - # 可視化結果を保存 (pose, stethoscope) - # --------------------------------------------------------- + # (A') -- RTMPose + YOLOX の検出処理終了時刻 (パイプライン計測用) + pipeline_detection_end = time.time() + detection_time = pipeline_detection_end - pipeline_detection_start + + # 可視化結果を保存 if (RTMPOSE_ENABLED or POSENET_ENABLED) and ( YOLOX_ENABLED or MobileNetV1SSD_ENABLED ): @@ -592,23 +603,25 @@ "stethoscope_y": stethoscope_y, } - # --------------------------------------------------------- - # (C) EARSNET (ここでフレーム毎に実行し、rowに書き込む) - # --------------------------------------------------------- + # (C) EARSNET if EARSNET_ENABLED: start_earsnet = time.time() earsnet_x, earsnet_y = earsnet_predictor.predict(image_path) - timings["earsnet_single"].append(time.time() - start_earsnet) + end_earsnet = time.time() + timings["earsnet_single"].append(end_earsnet - start_earsnet) - # row にEARSNET座標を格納 row["earsnet_stethoscope_x"] = earsnet_x row["earsnet_stethoscope_y"] = earsnet_y - # row を保存 - # --------------------------------------------------------- + # EARSNETパイプライン時間 (単体処理として計測しておく) + pipeline_earsnet_time = end_earsnet - start_earsnet + timings["pipeline_earsnet"].append(pipeline_earsnet_time) + rows.append(row) - # 正規化 + # ============================================================ + # (4) 正規化処理 (4点+聴診器) + # ============================================================ source_points = np.array( [ [float(row[f"{pos}_x"]), float(row[f"{pos}_y"])] @@ -623,18 +636,6 @@ source_points.flatten(), stethoscope_point ) - # EARSNet も同様に stethoscope_x,y を正規化 → ここでは省略例 - # しかし "耳" は別枠の場合もあるので、行うなら同様に対応 - # 例: earsnet_points = np.array([earsnet_x, earsnet_y]) - # ... で正規化etc. - - # 一旦 stethoscope用だけ - earsnet_x_n, earsnet_y_n = 0, 0 - if EARSNET_ENABLED: - # 224×224にリサイズしている場合、単純計算だけでなく - # ここでは "ただの合成例" として省略 - pass - normalized_row = { "image_file_name": image_file_name, "left_shoulder_x": normalized_points[0, 0], @@ -649,17 +650,80 @@ "stethoscope_y": normalized_points[4, 1], } if EARSNET_ENABLED: - normalized_row["earsnet_stethoscope_x"] = earsnet_x # 必要に応じて計算 - normalized_row["earsnet_stethoscope_y"] = earsnet_y + normalized_row["earsnet_stethoscope_x"] = row["earsnet_stethoscope_x"] + normalized_row["earsnet_stethoscope_y"] = row["earsnet_stethoscope_y"] normalized_rows.append(normalized_row) - # フレームごとの処理完了 + # (5) パイプライン推論 (Conv, LightGBM, XGBoost, etc.) + if RTMPOSE_ENABLED and YOLOX_ENABLED: + # conv + if CONV_ENABLED: + conv_start = time.time() + source_pts = np.array( + [ + [float(row[f"{pos}_x"]), float(row[f"{pos}_y"])] + for pos in [ + "left_shoulder", + "right_shoulder", + "left_hip", + "right_hip", + ] + ], + dtype=np.float32, + ) + stetho_pt = np.array( + [float(row["stethoscope_x"]), float(row["stethoscope_y"])] + ) + conv_stethoscope = calc_position.calc_affine(source_pts, *stetho_pt) + conv_end = time.time() + timings["conv_single"].append(conv_end - conv_start) + + pipeline_time_conv = detection_time + (conv_end - conv_start) + timings["pipeline_rtmpose_yolox_conv"].append(pipeline_time_conv) + + # XGBoost + if XGBOOST_ENABLED: + xg_start = time.time() + input_data_xg = ( + pd.DataFrame([normalized_rows[-1]]) + if NORMALIZE_ENABLED + else pd.DataFrame([rows[-1]]) + ) + X_scaled_x = xg_scaler_x.transform(input_data_xg[input_columns]) + xg_x_pred = int(xg_model_x.predict(X_scaled_x)[0]) + X_scaled_y = xg_scaler_y.transform(input_data_xg[input_columns]) + xg_y_pred = int(xg_model_y.predict(X_scaled_y)[0]) + xg_end = time.time() + timings["xgboost_single"].append(xg_end - xg_start) + + pipeline_time_xgboost = detection_time + (xg_end - xg_start) + timings["pipeline_rtmpose_yolox_xgboost"].append(pipeline_time_xgboost) + + # LightGBM + if LIGHTGBM_ENABLED: + lgb_start = time.time() + input_data_lgb = ( + pd.DataFrame([normalized_rows[-1]]) + if NORMALIZE_ENABLED + else pd.DataFrame([rows[-1]]) + ) + X_scaled_x = lgb_scaler_x.transform(input_data_lgb[input_columns]) + lgb_x_pred = int(lgb_model_x.predict(X_scaled_x)[0]) + X_scaled_y = lgb_scaler_y.transform(input_data_lgb[input_columns]) + lgb_y_pred = int(lgb_model_y.predict(X_scaled_y)[0]) + lgb_end = time.time() + timings["lightgbm_single"].append(lgb_end - lgb_start) + + pipeline_time_lightgbm = detection_time + (lgb_end - lgb_start) + timings["pipeline_rtmpose_yolox_lightgbm"].append( + pipeline_time_lightgbm + ) + processed_frames += 1 # ======================================================================== - # (5) 各フレームの位置推定(Conv, LightGBM, XGBoost, CatBoost, NGBoost) - # → CSV 書き込み + # CSV 書き込み # ======================================================================== if rows: print(f"Writing {len(rows)} rows to CSV...") @@ -674,9 +738,6 @@ fieldnames.extend(["catboost_stethoscope_x", "catboost_stethoscope_y"]) if NGBOOST_ENABLED: fieldnames.extend(["ngboost_stethoscope_x", "ngboost_stethoscope_y"]) - # EARSNETはすでに row に earsnet_stethoscope_x,y があるのでOK - - os.makedirs(results_dir, exist_ok=True) csvfile_path = os.path.join(results_dir, "results.csv") normfile_path = os.path.join(results_dir, "results-convert.csv") @@ -689,7 +750,7 @@ writer.writeheader() norm_writer.writeheader() - # 前回値を保持する辞書 + # 前回値を保持する辞書 (未検出時に使いたい場合) prev_values = {} if CONV_ENABLED: prev_values["conv"] = (180, 180) @@ -701,153 +762,10 @@ prev_values["catboost"] = (180, 180) if NGBOOST_ENABLED: prev_values["ngboost"] = (180, 180) - # EARSNETは前回値利用しないなら不要 - for row, norm_row in zip(rows, normalized_rows): - input_data = ( - pd.DataFrame([norm_row]) - if NORMALIZE_ENABLED - else pd.DataFrame([row]) - ) - - # 聴診器未検出の場合 - if row["stethoscope_x"] == 0 and row["stethoscope_y"] == 0: - # 省略: conv/lightgbm/xgboostなどで前回値代入 - for key in prev_values: - row[f"{key}_stethoscope_x"], row[f"{key}_stethoscope_y"] = ( - prev_values[key] - ) - ( - norm_row[f"{key}_stethoscope_x"], - norm_row[f"{key}_stethoscope_y"], - ) = prev_values[key] - else: - # conv - if CONV_ENABLED: - start_time_conv = time.time() - source_pts = np.array( - [ - [float(row[f"{pos}_x"]), float(row[f"{pos}_y"])] - for pos in [ - "left_shoulder", - "right_shoulder", - "left_hip", - "right_hip", - ] - ], - dtype=np.float32, - ) - stetho_pt = np.array( - [float(row["stethoscope_x"]), float(row["stethoscope_y"])] - ) - conv_stethoscope = calc_position.calc_affine( - source_pts, *stetho_pt - ) - row["conv_stethoscope_x"], row["conv_stethoscope_y"] = ( - conv_stethoscope - ) - ( - norm_row["conv_stethoscope_x"], - norm_row["conv_stethoscope_y"], - ) = conv_stethoscope - end_time_conv = time.time() - timings["conv_single"].append(end_time_conv - start_time_conv) - prev_values["conv"] = conv_stethoscope - - # LightGBM - if LIGHTGBM_ENABLED: - start_time_lgb = time.time() - X_scaled_x = lgb_scaler_x.transform(input_data[input_columns]) - lgb_x_pred = int(lgb_model_x.predict(X_scaled_x)[0]) - X_scaled_y = lgb_scaler_y.transform(input_data[input_columns]) - lgb_y_pred = int(lgb_model_y.predict(X_scaled_y)[0]) - row["lightGBM_stethoscope_x"], row["lightGBM_stethoscope_y"] = ( - lgb_x_pred, - lgb_y_pred, - ) - ( - norm_row["lightGBM_stethoscope_x"], - norm_row["lightGBM_stethoscope_y"], - ) = ( - lgb_x_pred, - lgb_y_pred, - ) - end_time_lgb = time.time() - timings["lightgbm_single"].append(end_time_lgb - start_time_lgb) - prev_values["lightGBM"] = (lgb_x_pred, lgb_y_pred) - - # XGBoost - if XGBOOST_ENABLED: - start_time_xgb = time.time() - X_scaled_x = xg_scaler_x.transform(input_data[input_columns]) - xg_x_pred = int(xg_model_x.predict(X_scaled_x)[0]) - X_scaled_y = xg_scaler_y.transform(input_data[input_columns]) - xg_y_pred = int(xg_model_y.predict(X_scaled_y)[0]) - row["Xgboost_stethoscope_x"], row["Xgboost_stethoscope_y"] = ( - xg_x_pred, - xg_y_pred, - ) - ( - norm_row["Xgboost_stethoscope_x"], - norm_row["Xgboost_stethoscope_y"], - ) = ( - xg_x_pred, - xg_y_pred, - ) - end_time_xgb = time.time() - timings["xgboost_single"].append(end_time_xgb - start_time_xgb) - prev_values["Xgboost"] = (xg_x_pred, xg_y_pred) - - # CatBoost - if CATBOOST_ENABLED: - start_time_cat = time.time() - catboost_x = int( - catboost_model_x.predict(input_data[input_columns])[0] - ) - catboost_y = int( - catboost_model_y.predict(input_data[input_columns])[0] - ) - row["catboost_stethoscope_x"], row["catboost_stethoscope_y"] = ( - catboost_x, - catboost_y, - ) - ( - norm_row["catboost_stethoscope_x"], - norm_row["catboost_stethoscope_y"], - ) = ( - catboost_x, - catboost_y, - ) - end_time_cat = time.time() - # timings["catboost_single"].append( ... ) # 必要なら追加 - prev_values["catboost"] = (catboost_x, catboost_y) - - # NGBoost - if NGBOOST_ENABLED: - start_time_ngb = time.time() - ngboost_x = int( - ngboost_model_x.predict(input_data[input_columns])[0] - ) - ngboost_y = int( - ngboost_model_y.predict(input_data[input_columns])[0] - ) - row["ngboost_stethoscope_x"], row["ngboost_stethoscope_y"] = ( - ngboost_x, - ngboost_y, - ) - ( - norm_row["ngboost_stethoscope_x"], - norm_row["ngboost_stethoscope_y"], - ) = ( - ngboost_x, - ngboost_y, - ) - end_time_ngb = time.time() - # timings["ngboost_single"].append( ... ) # 必要なら追加 - prev_values["ngboost"] = (ngboost_x, ngboost_y) - - writer.writerow(row) - norm_writer.writerow(norm_row) + for row_, norm_row_ in zip(rows, normalized_rows): + writer.writerow(row_) + norm_writer.writerow(norm_row_) print(f"Processed and saved results to: {csvfile_path}") print(f"Processed and saved normalized results to: {normfile_path}") @@ -890,18 +808,22 @@ ], ) writer.writeheader() - for row in fps_data: - writer.writerow(row) + for rowf in fps_data: + writer.writerow(rowf) print("\n===== FPS Results (subcomponent & pipeline) =====") - for row in fps_data: + for rowf in fps_data: print( - f"{row['method_name']}: calls={row['num_calls']}, total={row['total_time_sec']}s, " - f"avg={row['avg_time_sec']}s, FPS={row['fps']}" + f"{rowf['method_name']}: calls={rowf['num_calls']}, " + f"total={rowf['total_time_sec']}s, avg={rowf['avg_time_sec']}s, FPS={rowf['fps']}" ) def generate_visualizations(csv_path, original_images_dir, results_dir): + """ + NaNが混入した場合に描画でエラーにならないように修正。 + 聴診器の推論結果がNaNの場合は描画をスキップする。 + """ df = pd.read_csv(csv_path) body_image = cv2.imread("./images/body/BodyF.png") @@ -942,12 +864,15 @@ } for _, row in df.iterrows(): - original_image = cv2.imread( - os.path.join(original_images_dir, row["image_file_name"]) - ) + original_image_path = os.path.join(original_images_dir, row["image_file_name"]) + if not os.path.exists(original_image_path): + continue + + original_image = cv2.imread(original_image_path) if original_image is None: continue + # 肩・腰・聴診器などをマーキング for point in [ "left_shoulder", "right_shoulder", @@ -955,10 +880,17 @@ "right_hip", "stethoscope", ]: - if point + "_x" in row and point + "_y" in row: + col_x = f"{point}_x" + col_y = f"{point}_y" + if col_x in row and col_y in row: + val_x = row[col_x] + val_y = row[col_y] + # NaNチェック + if pd.isna(val_x) or pd.isna(val_y): + continue cv2.circle( original_image, - (int(row[f"{point}_x"]), int(row[f"{point}_y"])), + (int(val_x), int(val_y)), 10, (255, 255, 0), -1, @@ -969,16 +901,26 @@ original_image, ) + # BodyF.png の上に軌跡を描画する combined_image_with_traj = body_image.copy() combined_image_without_traj = body_image.copy() for key in points: - if f"{key}_stethoscope_x" not in row: + col_x = f"{key}_stethoscope_x" + col_y = f"{key}_stethoscope_y" + if col_x not in row or col_y not in row: continue - x, y = int(row[f"{key}_stethoscope_x"]), int(row[f"{key}_stethoscope_y"]) + val_x = row[col_x] + val_y = row[col_y] + # NaNであればスキップ + if pd.isna(val_x) or pd.isna(val_y): + continue + + x, y = int(val_x), int(val_y) points[key].append((x, y)) + # 1) 個別 with trajectory image_with_trajectory = body_image.copy() if len(points[key]) > 1: cv2.polylines( @@ -989,7 +931,11 @@ 2, ) cv2.circle( - image_with_trajectory, (x, y), 10, colors.get(key, (0, 0, 255)), -1 + image_with_trajectory, + (x, y), + 10, + colors.get(key, (0, 0, 255)), + -1, ) cv2.imwrite( os.path.join( @@ -998,29 +944,15 @@ image_with_trajectory, ) - if len(points[key]) > 1: - cv2.polylines( - combined_image_with_traj, - [np.array(points[key])], - False, - colors.get(key, (0, 0, 255)), - 2, - ) + # 2) 個別 without trajectory + image_without_trajectory = body_image.copy() cv2.circle( - combined_image_with_traj, (x, y), 10, colors.get(key, (0, 0, 255)), -1 - ) - cv2.circle( - combined_image_without_traj, + image_without_trajectory, (x, y), 10, colors.get(key, (0, 0, 255)), -1, ) - - image_without_trajectory = body_image.copy() - cv2.circle( - image_without_trajectory, (x, y), 10, colors.get(key, (0, 0, 255)), -1 - ) cv2.imwrite( os.path.join( results_dir, @@ -1030,6 +962,39 @@ image_without_trajectory, ) + # 3) combined with trajectory + if len(points[key]) > 1: + cv2.polylines( + combined_image_with_traj, + [np.array(points[key])], + False, + colors.get(key, (0, 0, 255)), + 2, + ) + cv2.circle( + combined_image_with_traj, + (x, y), + 10, + colors.get(key, (0, 0, 255)), + -1, + ) + # 4) combined without trajectory + cv2.circle( + combined_image_without_traj, + (x, y), + 10, + colors.get(key, (0, 0, 255)), + -1, + ) + + # まとめて保存 + os.makedirs( + os.path.join(results_dir, "combined_with_trajectory"), exist_ok=True + ) + os.makedirs( + os.path.join(results_dir, "combined_without_trajectory"), exist_ok=True + ) + cv2.imwrite( os.path.join( results_dir, "combined_with_trajectory", row["image_file_name"] @@ -1043,6 +1008,7 @@ combined_image_without_traj, ) + # 動画化 create_video_from_images( os.path.join(results_dir, "marked_images"), os.path.join(results_dir, "marked_video.mp4"), @@ -1061,6 +1027,11 @@ def create_video_from_images(image_dir, output_path): + """ + 指定ディレクトリ内の PNG 画像を1つの動画に変換する + """ + if not os.path.exists(image_dir): + return images = sorted( [img for img in os.listdir(image_dir) if img.endswith(".png")], key=lambda x: int(re.search(r"(\d+)", x).group()), @@ -1078,7 +1049,8 @@ ) for image in images: - img = cv2.imread(os.path.join(image_dir, image)) + img_path = os.path.join(image_dir, image) + img = cv2.imread(img_path) video.write(img) video.release() @@ -1101,7 +1073,10 @@ det_checkpoint = ( "models/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth" ) - pose_config = "modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py" + pose_config = ( + "modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/" + "rtmpose-l_8xb256-420e_body8-256x192.py" + ) pose_checkpoint = "models/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth" args = parser.parse_args() @@ -1136,7 +1111,6 @@ process_images(args, detector, pose_estimator, visualizer) else: - # RTMPOSE 未使用時 process_images(args, None, None, None) # -------------------------