def extract_keypoints_rtmpose(pose_results):
    """Pick the most clearly visible person from RTMPose results.

    Scans every predicted instance across all pose data samples and selects
    the one whose keypoints have the highest mean visibility score.

    Args:
        pose_results: List of pose data samples, each exposing a
            ``pred_instances`` collection whose elements carry ``keypoints``
            and ``keypoints_visible`` fields.

    Returns:
        The ``(K, 2)`` keypoint array of the best instance, or ``None`` when
        there are no results or no instance beats a zero visibility score.
    """
    if not pose_results:
        print("No pose results found.")
        return None

    best = None
    best_score = 0
    for sample in pose_results:
        for candidate in sample.pred_instances:
            score = np.mean(candidate.keypoints_visible)
            # Strict '>' keeps the earliest instance on ties and rejects
            # instances whose every keypoint is invisible (score == 0).
            if score > best_score:
                best_score = score
                best = candidate

    if best is None:
        print("No valid instances found.")
        return None

    return best.keypoints[0]
merge_data_samples(pose_results) + keypoints = extract_keypoints_rtmpose(pose_results) + if keypoints is None: + print(f"Failed to extract keypoints for image: {image_path}") + continue + left_shoulder = keypoints[5] + right_shoulder = keypoints[6] + left_hip = keypoints[11] + right_hip = keypoints[12] + if visualizer is not None: + visualizer.add_datasample( + "result", + frame, + data_sample=data_samples, + draw_gt=False, + draw_heatmap=False, + draw_bbox=False, + show_kpt_idx=False, + skeleton_style="mmpose", + show=False, + wait_time=0, + kpt_thr=0.3, + ) + pose_overlay_img = visualizer.get_image() + else: + print("No pose estimation method enabled. Please enable either PoseNet or RTMPose.") + continue stethoscope_overlay_img, stethoscope_x, stethoscope_y = ears_ai.ssd_detect(frame, None) cv2.imwrite(os.path.join(pose_overlay_dir, image_file_name), cv2.cvtColor(pose_overlay_img, cv2.COLOR_RGB2BGR)) @@ -127,14 +205,14 @@ row = { "image_file_name": image_file_name, - "left_shoulder_x": landmarks[0][1], - "left_shoulder_y": landmarks[0][0], - "right_shoulder_x": landmarks[1][1], - "right_shoulder_y": landmarks[1][0], - "left_hip_x": landmarks[2][1], - "left_hip_y": landmarks[2][0], - "right_hip_x": landmarks[3][1], - "right_hip_y": landmarks[3][0], + "left_shoulder_x": left_shoulder[1], + "left_shoulder_y": left_shoulder[0], + "right_shoulder_x": right_shoulder[1], + "right_shoulder_y": right_shoulder[0], + "left_hip_x": left_hip[1], + "left_hip_y": left_hip[0], + "right_hip_x": right_hip[1], + "right_hip_y": right_hip[0], "stethoscope_x": stethoscope_x, "stethoscope_y": stethoscope_y, } @@ -166,6 +244,7 @@ normalized_rows.append(normalized_row) if rows: + print(f"Writing {len(rows)} rows to CSV...") fieldnames = list(rows[0].keys()) if CONV_ENABLED: fieldnames.extend(["conv_stethoscope_x", "conv_stethoscope_y"]) @@ -247,16 +326,14 @@ print(f"Processed and saved results to: {csv_path}") print(f"Processed and saved normalized results to: {normalized_csv_path}") - 
generate_visualizations(csv_path, base_dir, output_dir) + generate_visualizations(csv_path, base_dir, results_dir) else: print("No data to write to CSV.") -def generate_visualizations(csv_path, original_images_dir, output_dir): +def generate_visualizations(csv_path, original_images_dir, results_dir): df = pd.read_csv(csv_path) body_image = cv2.imread("./images/body/BodyF.png") - results_dir = os.path.join(output_dir, "results") - os.makedirs(results_dir, exist_ok=True) dirs = {"marked": "marked_images"} if CONV_ENABLED: @@ -266,7 +343,6 @@ if LIGHTGBM_ENABLED: dirs["lightGBM"] = "lightGBM" - # 必要なディレクトリを作成 os.makedirs(os.path.join(results_dir, "marked_images"), exist_ok=True) for key in dirs: if key != "marked": @@ -282,7 +358,6 @@ print(f"Failed to load image: {os.path.join(original_images_dir, row['image_file_name'])}") continue - # Draw markers on original image for point in ["left_shoulder", "right_shoulder", "left_hip", "right_hip", "stethoscope"]: cv2.circle(original_image, (int(row[f"{point}_x"]), int(row[f"{point}_y"])), 10, (255, 255, 0), -1) @@ -292,7 +367,6 @@ x, y = int(row[f"{key}_stethoscope_x"]), int(row[f"{key}_stethoscope_y"]) points[key].append((x, y)) - # Create image with trajectory image_with_trajectory = body_image.copy() if len(points[key]) > 1: cv2.polylines(image_with_trajectory, [np.array(points[key])], False, colors[key], 2) @@ -301,7 +375,6 @@ os.path.join(results_dir, f"{dirs[key]}_with_trajectory", row["image_file_name"]), image_with_trajectory ) - # Create image without trajectory image_without_trajectory = body_image.copy() cv2.circle(image_without_trajectory, (x, y), 10, colors[key], -1) cv2.imwrite( @@ -309,7 +382,6 @@ image_without_trajectory, ) - # Generate videos create_video_from_images(os.path.join(results_dir, "marked_images"), os.path.join(results_dir, "marked_video.mp4")) for key in dirs: @@ -347,16 +419,36 @@ print(f"Created video: {output_path}") -if __name__ == "__main__": +def main(): + parser = 
argparse.ArgumentParser(description="Process video and generate results.") parser = argparse.ArgumentParser(description="Process video and generate results.") parser.add_argument("--video_path", default="./video/Test3-1.mp4", help="Path to the input video file") parser.add_argument("--output_dir", default="output", help="Directory to save output images and results") + det_config = "modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py" + det_checkpoint = ( + "https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth" + ) + pose_config = "modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py" + pose_checkpoint = "https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth" args = parser.parse_args() - # Create output directory structure os.makedirs(args.output_dir, exist_ok=True) frames_dir = os.path.join(args.output_dir, "frames") video_to_frames(args.video_path, frames_dir) - process_images(frames_dir, args.output_dir) + + if RTMPOSE_ENABLED: + detector = init_detector(det_config, det_checkpoint, device="cuda:0") + detector.cfg = adapt_mmdet_pipeline(detector.cfg) + pose_estimator = init_pose_estimator(pose_config, pose_checkpoint, device="cuda:0") + visualizer = VISUALIZERS.build(pose_estimator.cfg.visualizer) + visualizer.set_dataset_meta(pose_estimator.dataset_meta, skeleton_style="mmpose") + + process_images(args, detector, pose_estimator, visualizer) + else: + process_images(args, None, None, None) + + +if __name__ == "__main__": + main() diff --git a/modules/rtmpose/configs/_base_/datasets/300vw.py b/modules/rtmpose/configs/_base_/datasets/300vw.py new file mode 100644 index 0000000..5d5ff02 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/300vw.py @@ -0,0 +1,134 @@ +dataset_info = dict( + dataset_name='300vw', + paper_info=dict( + author='Jie Shen, Stefanos Zafeiriou, Grigorios 
G. Chrysos, ' + 'Jean Kossaifi, Georgios Tzimiropoulos, Maja Pantic', + title='The First Facial Landmark Tracking in-the-Wild Challenge: ' + 'Benchmark and Results', + container='Proceedings of the IEEE ' + 'international conference on computer vision workshops', + year='2016', + homepage='https://ibug.doc.ic.ac.uk/resources/300-VW/', + ), + keypoint_info={ + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-16'), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-15'), + 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-14'), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-13'), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-12'), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-11'), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-10'), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-9'), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap=''), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-7'), + 10: + dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-6'), + 11: + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-5'), + 12: + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-4'), + 13: + dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-3'), + 14: + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-2'), + 15: + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-1'), + 16: + dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap='kpt-0'), + 17: + dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-26'), + 18: + dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-25'), + 19: + dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-24'), + 20: + dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-23'), + 21: + dict(name='kpt-21', id=21, 
color=[255, 0, 0], type='', swap='kpt-22'), + 22: + dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-21'), + 23: + dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-20'), + 24: + dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-19'), + 25: + dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-18'), + 26: + dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-17'), + 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''), + 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap=''), + 29: dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap=''), + 30: dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap=''), + 31: + dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap='kpt-35'), + 32: + dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-34'), + 33: dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap=''), + 34: + dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-32'), + 35: + dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap='kpt-31'), + 36: + dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-45'), + 37: + dict(name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-44'), + 38: + dict(name='kpt-38', id=38, color=[255, 0, 0], type='', swap='kpt-43'), + 39: + dict(name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-42'), + 40: + dict(name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-47'), + 41: dict( + name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-46'), + 42: dict( + name='kpt-42', id=42, color=[255, 0, 0], type='', swap='kpt-39'), + 43: dict( + name='kpt-43', id=43, color=[255, 0, 0], type='', swap='kpt-38'), + 44: dict( + name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-37'), + 45: dict( + name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-36'), + 46: dict( + name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-41'), + 47: dict( + 
name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-40'), + 48: dict( + name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-54'), + 49: dict( + name='kpt-49', id=49, color=[255, 0, 0], type='', swap='kpt-53'), + 50: dict( + name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-52'), + 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''), + 52: dict( + name='kpt-52', id=52, color=[255, 0, 0], type='', swap='kpt-50'), + 53: dict( + name='kpt-53', id=53, color=[255, 0, 0], type='', swap='kpt-49'), + 54: dict( + name='kpt-54', id=54, color=[255, 0, 0], type='', swap='kpt-48'), + 55: dict( + name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-59'), + 56: dict( + name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-58'), + 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''), + 58: dict( + name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-56'), + 59: dict( + name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-55'), + 60: dict( + name='kpt-60', id=60, color=[255, 0, 0], type='', swap='kpt-64'), + 61: dict( + name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-63'), + 62: dict(name='kpt-62', id=62, color=[255, 0, 0], type='', swap=''), + 63: dict( + name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-61'), + 64: dict( + name='kpt-64', id=64, color=[255, 0, 0], type='', swap='kpt-60'), + 65: dict( + name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-67'), + 66: dict(name='kpt-66', id=66, color=[255, 0, 0], type='', swap=''), + 67: dict( + name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-65'), + }, + skeleton_info={}, + joint_weights=[1.] 
* 68, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/300w.py b/modules/rtmpose/configs/_base_/datasets/300w.py new file mode 100644 index 0000000..2920023 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/300w.py @@ -0,0 +1,134 @@ +dataset_info = dict( + dataset_name='300w', + paper_info=dict( + author='Sagonas, Christos and Antonakos, Epameinondas ' + 'and Tzimiropoulos, Georgios and Zafeiriou, Stefanos ' + 'and Pantic, Maja', + title='300 faces in-the-wild challenge: ' + 'Database and results', + container='Image and vision computing', + year='2016', + homepage='https://ibug.doc.ic.ac.uk/resources/300-W/', + ), + keypoint_info={ + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-16'), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-15'), + 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-14'), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-13'), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-12'), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-11'), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-10'), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-9'), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap=''), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-7'), + 10: + dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-6'), + 11: + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-5'), + 12: + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-4'), + 13: + dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-3'), + 14: + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-2'), + 15: + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-1'), + 16: + dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap='kpt-0'), + 17: + dict(name='kpt-17', id=17, color=[255, 0, 0], 
type='', swap='kpt-26'), + 18: + dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-25'), + 19: + dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-24'), + 20: + dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-23'), + 21: + dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap='kpt-22'), + 22: + dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-21'), + 23: + dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-20'), + 24: + dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-19'), + 25: + dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-18'), + 26: + dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-17'), + 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''), + 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap=''), + 29: dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap=''), + 30: dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap=''), + 31: + dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap='kpt-35'), + 32: + dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-34'), + 33: dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap=''), + 34: + dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-32'), + 35: + dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap='kpt-31'), + 36: + dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-45'), + 37: + dict(name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-44'), + 38: + dict(name='kpt-38', id=38, color=[255, 0, 0], type='', swap='kpt-43'), + 39: + dict(name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-42'), + 40: + dict(name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-47'), + 41: dict( + name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-46'), + 42: dict( + name='kpt-42', id=42, color=[255, 0, 0], type='', swap='kpt-39'), + 43: dict( + name='kpt-43', id=43, 
color=[255, 0, 0], type='', swap='kpt-38'), + 44: dict( + name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-37'), + 45: dict( + name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-36'), + 46: dict( + name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-41'), + 47: dict( + name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-40'), + 48: dict( + name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-54'), + 49: dict( + name='kpt-49', id=49, color=[255, 0, 0], type='', swap='kpt-53'), + 50: dict( + name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-52'), + 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''), + 52: dict( + name='kpt-52', id=52, color=[255, 0, 0], type='', swap='kpt-50'), + 53: dict( + name='kpt-53', id=53, color=[255, 0, 0], type='', swap='kpt-49'), + 54: dict( + name='kpt-54', id=54, color=[255, 0, 0], type='', swap='kpt-48'), + 55: dict( + name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-59'), + 56: dict( + name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-58'), + 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''), + 58: dict( + name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-56'), + 59: dict( + name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-55'), + 60: dict( + name='kpt-60', id=60, color=[255, 0, 0], type='', swap='kpt-64'), + 61: dict( + name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-63'), + 62: dict(name='kpt-62', id=62, color=[255, 0, 0], type='', swap=''), + 63: dict( + name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-61'), + 64: dict( + name='kpt-64', id=64, color=[255, 0, 0], type='', swap='kpt-60'), + 65: dict( + name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-67'), + 66: dict(name='kpt-66', id=66, color=[255, 0, 0], type='', swap=''), + 67: dict( + name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-65'), + }, + skeleton_info={}, + joint_weights=[1.] 
* 68, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/300wlp.py b/modules/rtmpose/configs/_base_/datasets/300wlp.py new file mode 100644 index 0000000..b7e2648 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/300wlp.py @@ -0,0 +1,86 @@ +dataset_info = dict( + dataset_name='300wlp', + paper_info=dict( + author='Xiangyu Zhu1, and Zhen Lei1 ' + 'and Xiaoming Liu2, and Hailin Shi1 ' + 'and Stan Z. Li1', + title='300 faces in-the-wild challenge: ' + 'Database and results', + container='Image and vision computing', + year='2016', + homepage='http://www.cbsr.ia.ac.cn/users/xiangyuzhu/' + 'projects/3DDFA/main.htm', + ), + keypoint_info={ + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap=''), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap=''), + 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap=''), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap=''), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap=''), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap=''), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap=''), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap=''), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap=''), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap=''), + 10: dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap=''), + 11: dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap=''), + 12: dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap=''), + 13: dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap=''), + 14: dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap=''), + 15: dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap=''), + 16: dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''), + 17: dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap=''), + 18: dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap=''), + 19: 
dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap=''), + 20: dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap=''), + 21: dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap=''), + 22: dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap=''), + 23: dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap=''), + 24: dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap=''), + 25: dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap=''), + 26: dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap=''), + 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''), + 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap=''), + 29: dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap=''), + 30: dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap=''), + 31: dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap=''), + 32: dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap=''), + 33: dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap=''), + 34: dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap=''), + 35: dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap=''), + 36: dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap=''), + 37: dict(name='kpt-37', id=37, color=[255, 0, 0], type='', swap=''), + 38: dict(name='kpt-38', id=38, color=[255, 0, 0], type='', swap=''), + 39: dict(name='kpt-39', id=39, color=[255, 0, 0], type='', swap=''), + 40: dict(name='kpt-40', id=40, color=[255, 0, 0], type='', swap=''), + 41: dict(name='kpt-41', id=41, color=[255, 0, 0], type='', swap=''), + 42: dict(name='kpt-42', id=42, color=[255, 0, 0], type='', swap=''), + 43: dict(name='kpt-43', id=43, color=[255, 0, 0], type='', swap=''), + 44: dict(name='kpt-44', id=44, color=[255, 0, 0], type='', swap=''), + 45: dict(name='kpt-45', id=45, color=[255, 0, 0], type='', swap=''), + 46: dict(name='kpt-46', id=46, color=[255, 0, 0], type='', swap=''), + 47: 
dict(name='kpt-47', id=47, color=[255, 0, 0], type='', swap=''), + 48: dict(name='kpt-48', id=48, color=[255, 0, 0], type='', swap=''), + 49: dict(name='kpt-49', id=49, color=[255, 0, 0], type='', swap=''), + 50: dict(name='kpt-50', id=50, color=[255, 0, 0], type='', swap=''), + 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''), + 52: dict(name='kpt-52', id=52, color=[255, 0, 0], type='', swap=''), + 53: dict(name='kpt-53', id=53, color=[255, 0, 0], type='', swap=''), + 54: dict(name='kpt-54', id=54, color=[255, 0, 0], type='', swap=''), + 55: dict(name='kpt-55', id=55, color=[255, 0, 0], type='', swap=''), + 56: dict(name='kpt-56', id=56, color=[255, 0, 0], type='', swap=''), + 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''), + 58: dict(name='kpt-58', id=58, color=[255, 0, 0], type='', swap=''), + 59: dict(name='kpt-59', id=59, color=[255, 0, 0], type='', swap=''), + 60: dict(name='kpt-60', id=60, color=[255, 0, 0], type='', swap=''), + 61: dict(name='kpt-61', id=61, color=[255, 0, 0], type='', swap=''), + 62: dict(name='kpt-62', id=62, color=[255, 0, 0], type='', swap=''), + 63: dict(name='kpt-63', id=63, color=[255, 0, 0], type='', swap=''), + 64: dict(name='kpt-64', id=64, color=[255, 0, 0], type='', swap=''), + 65: dict(name='kpt-65', id=65, color=[255, 0, 0], type='', swap=''), + 66: dict(name='kpt-66', id=66, color=[255, 0, 0], type='', swap=''), + 67: dict(name='kpt-67', id=67, color=[255, 0, 0], type='', swap=''), + }, + skeleton_info={}, + joint_weights=[1.] 
* 68, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/aflw.py b/modules/rtmpose/configs/_base_/datasets/aflw.py new file mode 100644 index 0000000..e092de6 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/aflw.py @@ -0,0 +1,44 @@ +dataset_info = dict( + dataset_name='aflw', + paper_info=dict( + author='Koestinger, Martin and Wohlhart, Paul and ' + 'Roth, Peter M and Bischof, Horst', + title='Annotated facial landmarks in the wild: ' + 'A large-scale, real-world database for facial ' + 'landmark localization', + container='2011 IEEE international conference on computer ' + 'vision workshops (ICCV workshops)', + year='2011', + homepage='https://www.tugraz.at/institute/icg/research/' + 'team-bischof/lrs/downloads/aflw/', + ), + keypoint_info={ + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-5'), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-4'), + 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-3'), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-2'), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-1'), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-0'), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-11'), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-10'), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-9'), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-8'), + 10: + dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-7'), + 11: + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-6'), + 12: + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-14'), + 13: dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap=''), + 14: + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-12'), + 15: + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-17'), + 16: 
dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''), + 17: + dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-15'), + 18: dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='') + }, + skeleton_info={}, + joint_weights=[1.] * 19, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/aic.py b/modules/rtmpose/configs/_base_/datasets/aic.py new file mode 100644 index 0000000..8d30f60 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/aic.py @@ -0,0 +1,140 @@ +dataset_info = dict( + dataset_name='aic', + paper_info=dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + keypoint_info={ + 0: + dict( + name='right_shoulder', + id=0, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 1: + dict( + name='right_elbow', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 2: + dict( + name='right_wrist', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='right_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 7: + dict( + name='right_knee', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 8: + dict( + name='right_ankle', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 9: + dict( + name='left_hip', + id=9, + color=[0, 255, 0], + type='lower', + 
swap='right_hip'), + 10: + dict( + name='left_knee', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='left_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 12: + dict( + name='head_top', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]), + 9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]), + 11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]), + 12: dict( + link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]), + 13: + dict(link=('left_shoulder', 'left_hip'), id=13, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1. 
+ ], + + # 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/' + # 'Evaluation/keypoint_eval/keypoint_eval.py#L50' + # delta = 2 x sigma + sigmas=[ + 0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144, + 0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081, + 0.01291456, 0.01236173 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/ak.py b/modules/rtmpose/configs/_base_/datasets/ak.py new file mode 100644 index 0000000..ddc9e27 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/ak.py @@ -0,0 +1,267 @@ +dataset_info = dict( + dataset_name='Animal Kingdom', + paper_info=dict( + author='Singapore University of Technology and Design, Singapore.' + ' Xun Long Ng, Kian Eng Ong, Qichen Zheng,' + ' Yun Ni, Si Yong Yeo, Jun Liu.', + title='Animal Kingdom: ' + 'A Large and Diverse Dataset for Animal Behavior Understanding', + container='Conference on Computer Vision ' + 'and Pattern Recognition (CVPR)', + year='2022', + homepage='https://sutdcv.github.io/Animal-Kingdom', + version='1.0 (2022-06)', + date_created='2022-06', + ), + keypoint_info={ + 0: + dict( + name='Head_Mid_Top', + id=0, + color=(225, 0, 255), + type='upper', + swap=''), + 1: + dict( + name='Eye_Left', + id=1, + color=[220, 20, 60], + type='upper', + swap='Eye_Right'), + 2: + dict( + name='Eye_Right', + id=2, + color=[0, 255, 255], + type='upper', + swap='Eye_Left'), + 3: + dict( + name='Mouth_Front_Top', + id=3, + color=(0, 255, 42), + type='upper', + swap=''), + 4: + dict( + name='Mouth_Back_Left', + id=4, + color=[221, 160, 221], + type='upper', + swap='Mouth_Back_Right'), + 5: + dict( + name='Mouth_Back_Right', + id=5, + color=[135, 206, 250], + type='upper', + swap='Mouth_Back_Left'), + 6: + dict( + name='Mouth_Front_Bottom', + id=6, + color=[50, 205, 50], + type='upper', + swap=''), + 7: + dict( + name='Shoulder_Left', + id=7, + color=[255, 182, 193], + type='upper', + swap='Shoulder_Right'), + 8: + dict( + name='Shoulder_Right', + 
id=8, + color=[0, 191, 255], + type='upper', + swap='Shoulder_Left'), + 9: + dict( + name='Elbow_Left', + id=9, + color=[255, 105, 180], + type='upper', + swap='Elbow_Right'), + 10: + dict( + name='Elbow_Right', + id=10, + color=[30, 144, 255], + type='upper', + swap='Elbow_Left'), + 11: + dict( + name='Wrist_Left', + id=11, + color=[255, 20, 147], + type='upper', + swap='Wrist_Right'), + 12: + dict( + name='Wrist_Right', + id=12, + color=[0, 0, 255], + type='upper', + swap='Wrist_Left'), + 13: + dict( + name='Torso_Mid_Back', + id=13, + color=(185, 3, 221), + type='upper', + swap=''), + 14: + dict( + name='Hip_Left', + id=14, + color=[255, 215, 0], + type='lower', + swap='Hip_Right'), + 15: + dict( + name='Hip_Right', + id=15, + color=[147, 112, 219], + type='lower', + swap='Hip_Left'), + 16: + dict( + name='Knee_Left', + id=16, + color=[255, 165, 0], + type='lower', + swap='Knee_Right'), + 17: + dict( + name='Knee_Right', + id=17, + color=[138, 43, 226], + type='lower', + swap='Knee_Left'), + 18: + dict( + name='Ankle_Left', + id=18, + color=[255, 140, 0], + type='lower', + swap='Ankle_Right'), + 19: + dict( + name='Ankle_Right', + id=19, + color=[128, 0, 128], + type='lower', + swap='Ankle_Left'), + 20: + dict( + name='Tail_Top_Back', + id=20, + color=(0, 251, 255), + type='lower', + swap=''), + 21: + dict( + name='Tail_Mid_Back', + id=21, + color=[32, 178, 170], + type='lower', + swap=''), + 22: + dict( + name='Tail_End_Back', + id=22, + color=(0, 102, 102), + type='lower', + swap='') + }, + skeleton_info={ + 0: + dict(link=('Eye_Left', 'Head_Mid_Top'), id=0, color=[220, 20, 60]), + 1: + dict(link=('Eye_Right', 'Head_Mid_Top'), id=1, color=[0, 255, 255]), + 2: + dict( + link=('Mouth_Front_Top', 'Mouth_Back_Left'), + id=2, + color=[221, 160, 221]), + 3: + dict( + link=('Mouth_Front_Top', 'Mouth_Back_Right'), + id=3, + color=[135, 206, 250]), + 4: + dict( + link=('Mouth_Front_Bottom', 'Mouth_Back_Left'), + id=4, + color=[221, 160, 221]), + 5: + dict( + 
link=('Mouth_Front_Bottom', 'Mouth_Back_Right'), + id=5, + color=[135, 206, 250]), + 6: + dict( + link=('Head_Mid_Top', 'Torso_Mid_Back'), id=6, + color=(225, 0, 255)), + 7: + dict( + link=('Torso_Mid_Back', 'Tail_Top_Back'), + id=7, + color=(185, 3, 221)), + 8: + dict( + link=('Tail_Top_Back', 'Tail_Mid_Back'), id=8, + color=(0, 251, 255)), + 9: + dict( + link=('Tail_Mid_Back', 'Tail_End_Back'), + id=9, + color=[32, 178, 170]), + 10: + dict( + link=('Head_Mid_Top', 'Shoulder_Left'), + id=10, + color=[255, 182, 193]), + 11: + dict( + link=('Head_Mid_Top', 'Shoulder_Right'), + id=11, + color=[0, 191, 255]), + 12: + dict( + link=('Shoulder_Left', 'Elbow_Left'), id=12, color=[255, 105, + 180]), + 13: + dict( + link=('Shoulder_Right', 'Elbow_Right'), + id=13, + color=[30, 144, 255]), + 14: + dict(link=('Elbow_Left', 'Wrist_Left'), id=14, color=[255, 20, 147]), + 15: + dict(link=('Elbow_Right', 'Wrist_Right'), id=15, color=[0, 0, 255]), + 16: + dict(link=('Tail_Top_Back', 'Hip_Left'), id=16, color=[255, 215, 0]), + 17: + dict( + link=('Tail_Top_Back', 'Hip_Right'), id=17, color=[147, 112, 219]), + 18: + dict(link=('Hip_Left', 'Knee_Left'), id=18, color=[255, 165, 0]), + 19: + dict(link=('Hip_Right', 'Knee_Right'), id=19, color=[138, 43, 226]), + 20: + dict(link=('Knee_Left', 'Ankle_Left'), id=20, color=[255, 140, 0]), + 21: + dict(link=('Knee_Right', 'Ankle_Right'), id=21, color=[128, 0, 128]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1. 
+ ], + sigmas=[ + 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, + 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, + 0.025, 0.025, 0.025 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/animalpose.py b/modules/rtmpose/configs/_base_/datasets/animalpose.py new file mode 100644 index 0000000..7f614f7 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/animalpose.py @@ -0,0 +1,166 @@ +dataset_info = dict( + dataset_name='animalpose', + paper_info=dict( + author='Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and ' + 'Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing', + title='Cross-Domain Adaptation for Animal Pose Estimation', + container='The IEEE International Conference on ' + 'Computer Vision (ICCV)', + year='2019', + homepage='https://sites.google.com/view/animal-pose/', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict( + name='L_EarBase', + id=2, + color=[0, 255, 0], + type='upper', + swap='R_EarBase'), + 3: + dict( + name='R_EarBase', + id=3, + color=[255, 128, 0], + type='upper', + swap='L_EarBase'), + 4: + dict(name='Nose', id=4, color=[51, 153, 255], type='upper', swap=''), + 5: + dict(name='Throat', id=5, color=[51, 153, 255], type='upper', swap=''), + 6: + dict( + name='TailBase', id=6, color=[51, 153, 255], type='lower', + swap=''), + 7: + dict( + name='Withers', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='L_F_Elbow', + id=8, + color=[0, 255, 0], + type='upper', + swap='R_F_Elbow'), + 9: + dict( + name='R_F_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_F_Elbow'), + 10: + dict( + name='L_B_Elbow', + id=10, + color=[0, 255, 0], + type='lower', + swap='R_B_Elbow'), + 11: + dict( + name='R_B_Elbow', + id=11, + color=[255, 128, 0], + type='lower', + swap='L_B_Elbow'), + 12: + dict( + 
name='L_F_Knee', + id=12, + color=[0, 255, 0], + type='upper', + swap='R_F_Knee'), + 13: + dict( + name='R_F_Knee', + id=13, + color=[255, 128, 0], + type='upper', + swap='L_F_Knee'), + 14: + dict( + name='L_B_Knee', + id=14, + color=[0, 255, 0], + type='lower', + swap='R_B_Knee'), + 15: + dict( + name='R_B_Knee', + id=15, + color=[255, 128, 0], + type='lower', + swap='L_B_Knee'), + 16: + dict( + name='L_F_Paw', + id=16, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 17: + dict( + name='R_F_Paw', + id=17, + color=[255, 128, 0], + type='upper', + swap='L_F_Paw'), + 18: + dict( + name='L_B_Paw', + id=18, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 19: + dict( + name='R_B_Paw', + id=19, + color=[255, 128, 0], + type='lower', + swap='L_B_Paw') + }, + skeleton_info={ + 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[51, 153, 255]), + 1: dict(link=('L_Eye', 'L_EarBase'), id=1, color=[0, 255, 0]), + 2: dict(link=('R_Eye', 'R_EarBase'), id=2, color=[255, 128, 0]), + 3: dict(link=('L_Eye', 'Nose'), id=3, color=[0, 255, 0]), + 4: dict(link=('R_Eye', 'Nose'), id=4, color=[255, 128, 0]), + 5: dict(link=('Nose', 'Throat'), id=5, color=[51, 153, 255]), + 6: dict(link=('Throat', 'Withers'), id=6, color=[51, 153, 255]), + 7: dict(link=('TailBase', 'Withers'), id=7, color=[51, 153, 255]), + 8: dict(link=('Throat', 'L_F_Elbow'), id=8, color=[0, 255, 0]), + 9: dict(link=('L_F_Elbow', 'L_F_Knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('L_F_Knee', 'L_F_Paw'), id=10, color=[0, 255, 0]), + 11: dict(link=('Throat', 'R_F_Elbow'), id=11, color=[255, 128, 0]), + 12: dict(link=('R_F_Elbow', 'R_F_Knee'), id=12, color=[255, 128, 0]), + 13: dict(link=('R_F_Knee', 'R_F_Paw'), id=13, color=[255, 128, 0]), + 14: dict(link=('TailBase', 'L_B_Elbow'), id=14, color=[0, 255, 0]), + 15: dict(link=('L_B_Elbow', 'L_B_Knee'), id=15, color=[0, 255, 0]), + 16: dict(link=('L_B_Knee', 'L_B_Paw'), id=16, color=[0, 255, 0]), + 17: dict(link=('TailBase', 'R_B_Elbow'), id=17, color=[255, 
128, 0]), + 18: dict(link=('R_B_Elbow', 'R_B_Knee'), id=18, color=[255, 128, 0]), + 19: dict(link=('R_B_Knee', 'R_B_Paw'), id=19, color=[255, 128, 0]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2, + 1.5, 1.5, 1.5, 1.5 + ], + + # Note: The original paper did not provide enough information about + # the sigmas. We modified from 'https://github.com/cocodataset/' + # 'cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py#L523' + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.10, 0.10, 0.10, 0.107, 0.107, + 0.107, 0.107, 0.087, 0.087, 0.087, 0.087, 0.089, 0.089, 0.089, 0.089 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/ap10k.py b/modules/rtmpose/configs/_base_/datasets/ap10k.py new file mode 100644 index 0000000..aecc173 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/ap10k.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='ap10k', + paper_info=dict( + author='Yu, Hang and Xu, Yufei and Zhang, Jing and ' + 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng', + title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild', + container='35th Conference on Neural Information Processing Systems ' + '(NeurIPS 2021) Track on Datasets and Bench-marks.', + year='2021', + homepage='https://github.com/AlexTheBad/AP-10K', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''), + 4: + dict( + name='Root of tail', + id=4, + color=[51, 153, 255], + type='lower', + swap=''), + 5: + dict( + name='L_Shoulder', + id=5, + color=[51, 153, 255], + type='upper', + swap='R_Shoulder'), + 6: + dict( + name='L_Elbow', + id=6, + color=[51, 153, 255], + type='upper', + swap='R_Elbow'), + 7: + dict( + name='L_F_Paw', + id=7, + 
color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 8: + dict( + name='R_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 9: + dict( + name='R_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_Elbow'), + 10: + dict( + name='R_F_Paw', + id=10, + color=[0, 255, 0], + type='lower', + swap='L_F_Paw'), + 11: + dict( + name='L_Hip', + id=11, + color=[255, 128, 0], + type='lower', + swap='R_Hip'), + 12: + dict( + name='L_Knee', + id=12, + color=[255, 128, 0], + type='lower', + swap='R_Knee'), + 13: + dict( + name='L_B_Paw', + id=13, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 14: + dict( + name='R_Hip', id=14, color=[0, 255, 0], type='lower', + swap='L_Hip'), + 15: + dict( + name='R_Knee', + id=15, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 16: + dict( + name='R_B_Paw', + id=16, + color=[0, 255, 0], + type='lower', + swap='L_B_Paw'), + }, + skeleton_info={ + 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]), + 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]), + 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]), + 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]), + 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]), + 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]), + 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]), + 7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]), + 8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]), + 10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]), + 11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]), + 12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]), + 13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]), + 14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]), + 15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]), 
+ 16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072, + 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/atrw.py b/modules/rtmpose/configs/_base_/datasets/atrw.py new file mode 100644 index 0000000..84d3fb3 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/atrw.py @@ -0,0 +1,144 @@ +dataset_info = dict( + dataset_name='atrw', + paper_info=dict( + author='Li, Shuyuan and Li, Jianguo and Tang, Hanlin ' + 'and Qian, Rui and Lin, Weiyao', + title='ATRW: A Benchmark for Amur Tiger ' + 'Re-identification in the Wild', + container='Proceedings of the 28th ACM ' + 'International Conference on Multimedia', + year='2020', + homepage='https://cvwc2019.github.io/challenge.html', + ), + keypoint_info={ + 0: + dict( + name='left_ear', + id=0, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 1: + dict( + name='right_ear', + id=1, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 2: + dict(name='nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict( + name='right_shoulder', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 4: + dict( + name='right_front_paw', + id=4, + color=[255, 128, 0], + type='upper', + swap='left_front_paw'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_front_paw', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_front_paw'), + 7: + dict( + name='right_hip', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='right_knee', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 9: + dict( + name='right_back_paw', + id=9, + color=[255, 128, 0], + type='lower', + 
swap='left_back_paw'), + 10: + dict( + name='left_hip', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 11: + dict( + name='left_knee', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 12: + dict( + name='left_back_paw', + id=12, + color=[0, 255, 0], + type='lower', + swap='right_back_paw'), + 13: + dict(name='tail', id=13, color=[51, 153, 255], type='lower', swap=''), + 14: + dict( + name='center', id=14, color=[51, 153, 255], type='lower', swap=''), + }, + skeleton_info={ + 0: + dict(link=('left_ear', 'nose'), id=0, color=[51, 153, 255]), + 1: + dict(link=('right_ear', 'nose'), id=1, color=[51, 153, 255]), + 2: + dict(link=('nose', 'center'), id=2, color=[51, 153, 255]), + 3: + dict( + link=('left_shoulder', 'left_front_paw'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_shoulder', 'center'), id=4, color=[0, 255, 0]), + 5: + dict( + link=('right_shoulder', 'right_front_paw'), + id=5, + color=[255, 128, 0]), + 6: + dict(link=('right_shoulder', 'center'), id=6, color=[255, 128, 0]), + 7: + dict(link=('tail', 'center'), id=7, color=[51, 153, 255]), + 8: + dict(link=('right_back_paw', 'right_knee'), id=8, color=[255, 128, 0]), + 9: + dict(link=('right_knee', 'right_hip'), id=9, color=[255, 128, 0]), + 10: + dict(link=('right_hip', 'tail'), id=10, color=[255, 128, 0]), + 11: + dict(link=('left_back_paw', 'left_knee'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_knee', 'left_hip'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_hip', 'tail'), id=13, color=[0, 255, 0]), + }, + joint_weights=[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + sigmas=[ + 0.0277, 0.0823, 0.0831, 0.0202, 0.0716, 0.0263, 0.0646, 0.0302, 0.0440, + 0.0316, 0.0333, 0.0547, 0.0263, 0.0683, 0.0539 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/campus.py b/modules/rtmpose/configs/_base_/datasets/campus.py new file mode 100644 index 0000000..06cc7ec --- /dev/null +++ 
b/modules/rtmpose/configs/_base_/datasets/campus.py @@ -0,0 +1,151 @@ +dataset_info = dict( + dataset_name='campus', + paper_info=dict( + author='Belagiannis, Vasileios and Amin, Sikandar and Andriluka, ' + 'Mykhaylo and Schiele, Bernt and Navab, Nassir and Ilic, Slobodan', + title='3D Pictorial Structures for Multiple Human Pose Estimation', + container='IEEE Computer Society Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://campar.in.tum.de/Chair/MultiHumanPose', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict( + name='right_wrist', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 7: + dict( + name='right_elbow', + id=7, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 8: + dict( + name='right_shoulder', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 9: + dict( + name='left_shoulder', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 10: + dict( + name='left_elbow', + id=10, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 11: + dict( + name='left_wrist', + id=11, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 12: + dict( + name='bottom_head', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict( + name='top_head', + id=13, + color=[51, 153, 255], + type='upper', + swap=''), + }, + skeleton_info={ 
+ 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('left_hip', 'left_knee'), id=2, color=[0, 255, 0]), + 3: + dict(link=('left_knee', 'left_ankle'), id=3, color=[0, 255, 0]), + 4: + dict(link=('right_hip', 'left_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('right_wrist', 'right_elbow'), id=5, color=[255, 128, 0]), + 6: + dict( + link=('right_elbow', 'right_shoulder'), id=6, color=[255, 128, 0]), + 7: + dict(link=('left_shoulder', 'left_elbow'), id=7, color=[0, 255, 0]), + 8: + dict(link=('left_elbow', 'left_wrist'), id=8, color=[0, 255, 0]), + 9: + dict(link=('right_hip', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_hip', 'left_shoulder'), id=10, color=[0, 255, 0]), + 11: + dict( + link=('right_shoulder', 'bottom_head'), id=11, color=[255, 128, + 0]), + 12: + dict(link=('left_shoulder', 'bottom_head'), id=12, color=[0, 255, 0]), + 13: + dict(link=('bottom_head', 'top_head'), id=13, color=[51, 153, 255]), + }, + joint_weights=[ + 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.0, 1.0 + ], + sigmas=[ + 0.089, 0.087, 0.107, 0.107, 0.087, 0.089, 0.062, 0.072, 0.079, 0.079, + 0.072, 0.062, 0.026, 0.026 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/coco.py b/modules/rtmpose/configs/_base_/datasets/coco.py new file mode 100644 index 0000000..787e834 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/coco.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 
153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 
0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/coco_aic.py b/modules/rtmpose/configs/_base_/datasets/coco_aic.py new file mode 100644 index 0000000..edd636b --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/coco_aic.py @@ -0,0 +1,205 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=[ + dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on 
computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + ], + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + 
swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='head_top', + id=17, + color=[51, 153, 255], + type='upper', + swap=''), + 18: + dict(name='neck', id=18, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 
1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5, 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.026, 0.026 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/coco_openpose.py b/modules/rtmpose/configs/_base_/datasets/coco_openpose.py new file mode 100644 index 0000000..7bab501 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/coco_openpose.py @@ -0,0 +1,157 @@ +dataset_info = dict( + dataset_name='coco_openpose', + paper_info=dict( + author='Zhe, Cao and Tomas, Simon and ' + 'Shih-En, Wei and Yaser, Sheikh', + title='OpenPose: Realtime Multi-Person 2D Pose ' + 'Estimation using Part Affinity Fields', + container='IEEE Transactions on Pattern Analysis ' + 'and Machine Intelligence', + year='2019', + homepage='https://github.com/CMU-Perceptual-Computing-Lab/openpose/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[255, 0, 0], type='upper', swap=''), + 1: + dict(name='neck', id=1, color=[255, 85, 0], type='upper', swap=''), + 2: + dict( + name='right_shoulder', + id=2, + color=[255, 170, 0], + type='upper', + swap='left_shoulder'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 255, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='right_wrist', + id=4, + color=[170, 255, 0], + type='upper', + swap='left_wrist'), + 5: + dict( + name='left_shoulder', + id=5, + color=[85, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_elbow', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 7: + dict( + name='left_wrist', + id=7, + color=[0, 255, 85], + type='upper', + swap='right_wrist'), + 8: + dict( + name='right_hip', + id=8, + color=[0, 255, 170], + type='lower', + swap='left_hip'), + 9: + dict( + name='right_knee', + id=9, + color=[0, 255, 255], + type='lower', + swap='left_knee'), + 10: + dict( + name='right_ankle', + id=10, + color=[0, 170, 255], + type='lower', + 
swap='left_ankle'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 85, 255], + type='lower', + swap='right_hip'), + 12: + dict( + name='left_knee', + id=12, + color=[0, 0, 255], + type='lower', + swap='right_knee'), + 13: + dict( + name='left_ankle', + id=13, + color=[85, 0, 255], + type='lower', + swap='right_ankle'), + 14: + dict( + name='right_eye', + id=14, + color=[170, 0, 255], + type='upper', + swap='left_eye'), + 15: + dict( + name='left_eye', + id=15, + color=[255, 0, 255], + type='upper', + swap='right_eye'), + 16: + dict( + name='right_ear', + id=16, + color=[255, 0, 170], + type='upper', + swap='left_ear'), + 17: + dict( + name='left_ear', + id=17, + color=[255, 0, 85], + type='upper', + swap='right_ear'), + }, + skeleton_info={ + 0: dict(link=('neck', 'right_shoulder'), id=0, color=[255, 0, 0]), + 1: dict(link=('neck', 'left_shoulder'), id=1, color=[255, 85, 0]), + 2: dict( + link=('right_shoulder', 'right_elbow'), id=2, color=[255, 170, 0]), + 3: + dict(link=('right_elbow', 'right_wrist'), id=3, color=[255, 255, 0]), + 4: + dict(link=('left_shoulder', 'left_elbow'), id=4, color=[170, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[85, 255, 0]), + 6: dict(link=('neck', 'right_hip'), id=6, color=[0, 255, 0]), + 7: dict(link=('right_hip', 'right_knee'), id=7, color=[0, 255, 85]), + 8: dict(link=('right_knee', 'right_ankle'), id=8, color=[0, 255, 170]), + 9: dict(link=('neck', 'left_hip'), id=9, color=[0, 255, 225]), + 10: dict(link=('left_hip', 'left_knee'), id=10, color=[0, 170, 255]), + 11: dict(link=('left_knee', 'left_ankle'), id=11, color=[0, 85, 255]), + 12: dict(link=('neck', 'nose'), id=12, color=[0, 0, 255]), + 13: dict(link=('nose', 'right_eye'), id=13, color=[255, 0, 170]), + 14: dict(link=('right_eye', 'right_ear'), id=14, color=[170, 0, 255]), + 15: dict(link=('nose', 'left_eye'), id=15, color=[255, 0, 255]), + 16: dict(link=('left_eye', 'left_ear'), id=16, color=[255, 0, 170]), + }, + joint_weights=[1.] 
* 18, + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.082 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/coco_wholebody.py b/modules/rtmpose/configs/_base_/datasets/coco_wholebody.py new file mode 100644 index 0000000..a739c97 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/coco_wholebody.py @@ -0,0 +1,1154 @@ +dataset_info = dict( + dataset_name='coco_wholebody', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + 
name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_big_toe', + id=17, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 18: + dict( + name='left_small_toe', + id=18, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 19: + dict( + name='left_heel', + id=19, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 20: + dict( + name='right_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 21: + dict( + name='right_small_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 22: + dict( + name='right_heel', + id=22, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 23: + dict( + name='face-0', + id=23, + color=[255, 255, 255], + type='', + swap='face-16'), + 24: + dict( + name='face-1', + id=24, + color=[255, 255, 255], + type='', + swap='face-15'), + 25: + dict( + name='face-2', + id=25, + color=[255, 255, 255], + type='', + swap='face-14'), + 26: + dict( + name='face-3', + id=26, + color=[255, 255, 255], + type='', + swap='face-13'), + 27: + dict( + name='face-4', + id=27, + color=[255, 255, 255], + type='', + swap='face-12'), + 28: + dict( + name='face-5', + id=28, + color=[255, 255, 255], + type='', + swap='face-11'), + 29: + dict( + name='face-6', + id=29, + color=[255, 255, 255], + type='', + swap='face-10'), + 30: + dict( + name='face-7', + id=30, + color=[255, 255, 255], + type='', 
+ swap='face-9'), + 31: + dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + 32: + dict( + name='face-9', + id=32, + color=[255, 255, 255], + type='', + swap='face-7'), + 33: + dict( + name='face-10', + id=33, + color=[255, 255, 255], + type='', + swap='face-6'), + 34: + dict( + name='face-11', + id=34, + color=[255, 255, 255], + type='', + swap='face-5'), + 35: + dict( + name='face-12', + id=35, + color=[255, 255, 255], + type='', + swap='face-4'), + 36: + dict( + name='face-13', + id=36, + color=[255, 255, 255], + type='', + swap='face-3'), + 37: + dict( + name='face-14', + id=37, + color=[255, 255, 255], + type='', + swap='face-2'), + 38: + dict( + name='face-15', + id=38, + color=[255, 255, 255], + type='', + swap='face-1'), + 39: + dict( + name='face-16', + id=39, + color=[255, 255, 255], + type='', + swap='face-0'), + 40: + dict( + name='face-17', + id=40, + color=[255, 255, 255], + type='', + swap='face-26'), + 41: + dict( + name='face-18', + id=41, + color=[255, 255, 255], + type='', + swap='face-25'), + 42: + dict( + name='face-19', + id=42, + color=[255, 255, 255], + type='', + swap='face-24'), + 43: + dict( + name='face-20', + id=43, + color=[255, 255, 255], + type='', + swap='face-23'), + 44: + dict( + name='face-21', + id=44, + color=[255, 255, 255], + type='', + swap='face-22'), + 45: + dict( + name='face-22', + id=45, + color=[255, 255, 255], + type='', + swap='face-21'), + 46: + dict( + name='face-23', + id=46, + color=[255, 255, 255], + type='', + swap='face-20'), + 47: + dict( + name='face-24', + id=47, + color=[255, 255, 255], + type='', + swap='face-19'), + 48: + dict( + name='face-25', + id=48, + color=[255, 255, 255], + type='', + swap='face-18'), + 49: + dict( + name='face-26', + id=49, + color=[255, 255, 255], + type='', + swap='face-17'), + 50: + dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + 51: + dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='face-29', 
id=52, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict( + name='face-31', + id=54, + color=[255, 255, 255], + type='', + swap='face-35'), + 55: + dict( + name='face-32', + id=55, + color=[255, 255, 255], + type='', + swap='face-34'), + 56: + dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-34', + id=57, + color=[255, 255, 255], + type='', + swap='face-32'), + 58: + dict( + name='face-35', + id=58, + color=[255, 255, 255], + type='', + swap='face-31'), + 59: + dict( + name='face-36', + id=59, + color=[255, 255, 255], + type='', + swap='face-45'), + 60: + dict( + name='face-37', + id=60, + color=[255, 255, 255], + type='', + swap='face-44'), + 61: + dict( + name='face-38', + id=61, + color=[255, 255, 255], + type='', + swap='face-43'), + 62: + dict( + name='face-39', + id=62, + color=[255, 255, 255], + type='', + swap='face-42'), + 63: + dict( + name='face-40', + id=63, + color=[255, 255, 255], + type='', + swap='face-47'), + 64: + dict( + name='face-41', + id=64, + color=[255, 255, 255], + type='', + swap='face-46'), + 65: + dict( + name='face-42', + id=65, + color=[255, 255, 255], + type='', + swap='face-39'), + 66: + dict( + name='face-43', + id=66, + color=[255, 255, 255], + type='', + swap='face-38'), + 67: + dict( + name='face-44', + id=67, + color=[255, 255, 255], + type='', + swap='face-37'), + 68: + dict( + name='face-45', + id=68, + color=[255, 255, 255], + type='', + swap='face-36'), + 69: + dict( + name='face-46', + id=69, + color=[255, 255, 255], + type='', + swap='face-41'), + 70: + dict( + name='face-47', + id=70, + color=[255, 255, 255], + type='', + swap='face-40'), + 71: + dict( + name='face-48', + id=71, + color=[255, 255, 255], + type='', + swap='face-54'), + 72: + dict( + name='face-49', + id=72, + color=[255, 255, 255], + type='', + swap='face-53'), + 73: + dict( + name='face-50', + id=73, + color=[255, 255, 255], 
+ type='', + swap='face-52'), + 74: + dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + 75: + dict( + name='face-52', + id=75, + color=[255, 255, 255], + type='', + swap='face-50'), + 76: + dict( + name='face-53', + id=76, + color=[255, 255, 255], + type='', + swap='face-49'), + 77: + dict( + name='face-54', + id=77, + color=[255, 255, 255], + type='', + swap='face-48'), + 78: + dict( + name='face-55', + id=78, + color=[255, 255, 255], + type='', + swap='face-59'), + 79: + dict( + name='face-56', + id=79, + color=[255, 255, 255], + type='', + swap='face-58'), + 80: + dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict( + name='face-58', + id=81, + color=[255, 255, 255], + type='', + swap='face-56'), + 82: + dict( + name='face-59', + id=82, + color=[255, 255, 255], + type='', + swap='face-55'), + 83: + dict( + name='face-60', + id=83, + color=[255, 255, 255], + type='', + swap='face-64'), + 84: + dict( + name='face-61', + id=84, + color=[255, 255, 255], + type='', + swap='face-63'), + 85: + dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='face-63', + id=86, + color=[255, 255, 255], + type='', + swap='face-61'), + 87: + dict( + name='face-64', + id=87, + color=[255, 255, 255], + type='', + swap='face-60'), + 88: + dict( + name='face-65', + id=88, + color=[255, 255, 255], + type='', + swap='face-67'), + 89: + dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + 90: + dict( + name='face-67', + id=90, + color=[255, 255, 255], + type='', + swap='face-65'), + 91: + dict( + name='left_hand_root', + id=91, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 92: + dict( + name='left_thumb1', + id=92, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 93: + dict( + name='left_thumb2', + id=93, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 94: + dict( + name='left_thumb3', + id=94, + color=[255, 128, 0], + type='', + 
swap='right_thumb3'), + 95: + dict( + name='left_thumb4', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 96: + dict( + name='left_forefinger1', + id=96, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 97: + dict( + name='left_forefinger2', + id=97, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 98: + dict( + name='left_forefinger3', + id=98, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 99: + dict( + name='left_forefinger4', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 100: + dict( + name='left_middle_finger1', + id=100, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 101: + dict( + name='left_middle_finger2', + id=101, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 102: + dict( + name='left_middle_finger3', + id=102, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 103: + dict( + name='left_middle_finger4', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 104: + dict( + name='left_ring_finger1', + id=104, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 105: + dict( + name='left_ring_finger2', + id=105, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 106: + dict( + name='left_ring_finger3', + id=106, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 107: + dict( + name='left_ring_finger4', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 108: + dict( + name='left_pinky_finger1', + id=108, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 109: + dict( + name='left_pinky_finger2', + id=109, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 110: + dict( + name='left_pinky_finger3', + id=110, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 111: + dict( + name='left_pinky_finger4', + id=111, + color=[0, 255, 0], + type='', + 
swap='right_pinky_finger4'), + 112: + dict( + name='right_hand_root', + id=112, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 113: + dict( + name='right_thumb1', + id=113, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 114: + dict( + name='right_thumb2', + id=114, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 115: + dict( + name='right_thumb3', + id=115, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 116: + dict( + name='right_thumb4', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 117: + dict( + name='right_forefinger1', + id=117, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 118: + dict( + name='right_forefinger2', + id=118, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 119: + dict( + name='right_forefinger3', + id=119, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 120: + dict( + name='right_forefinger4', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 121: + dict( + name='right_middle_finger1', + id=121, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 122: + dict( + name='right_middle_finger2', + id=122, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 123: + dict( + name='right_middle_finger3', + id=123, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 124: + dict( + name='right_middle_finger4', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 125: + dict( + name='right_ring_finger1', + id=125, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 126: + dict( + name='right_ring_finger2', + id=126, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 127: + dict( + name='right_ring_finger3', + id=127, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 128: + dict( + name='right_ring_finger4', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 129: + 
dict( + name='right_pinky_finger1', + id=129, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 130: + dict( + name='right_pinky_finger2', + id=130, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 131: + dict( + name='right_pinky_finger3', + id=131, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 132: + dict( + name='right_pinky_finger4', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, 
color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]), + 22: + dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=37, + color=[255, 51, 51]), + 38: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_hand_root', 
'left_pinky_finger1'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_hand_root', 'right_thumb1'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_hand_root', 'right_forefinger1'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_hand_root', 'right_ring_finger1'), + id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=61, + color=[0, 255, 
0]), + 62: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + }, + joint_weights=[1.] * 133, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066, + 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, + 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, + 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, + 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, + 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, + 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, + 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, + 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, + 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, + 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/coco_wholebody_face.py b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_face.py new file mode 100644 index 0000000..e208671 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_face.py @@ -0,0 +1,154 @@ +dataset_info = dict( + dataset_name='coco_wholebody_face', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the 
European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='face-0', id=0, color=[255, 0, 0], type='', swap='face-16'), + 1: + dict(name='face-1', id=1, color=[255, 0, 0], type='', swap='face-15'), + 2: + dict(name='face-2', id=2, color=[255, 0, 0], type='', swap='face-14'), + 3: + dict(name='face-3', id=3, color=[255, 0, 0], type='', swap='face-13'), + 4: + dict(name='face-4', id=4, color=[255, 0, 0], type='', swap='face-12'), + 5: + dict(name='face-5', id=5, color=[255, 0, 0], type='', swap='face-11'), + 6: + dict(name='face-6', id=6, color=[255, 0, 0], type='', swap='face-10'), + 7: + dict(name='face-7', id=7, color=[255, 0, 0], type='', swap='face-9'), + 8: dict(name='face-8', id=8, color=[255, 0, 0], type='', swap=''), + 9: + dict(name='face-9', id=9, color=[255, 0, 0], type='', swap='face-7'), + 10: + dict(name='face-10', id=10, color=[255, 0, 0], type='', swap='face-6'), + 11: + dict(name='face-11', id=11, color=[255, 0, 0], type='', swap='face-5'), + 12: + dict(name='face-12', id=12, color=[255, 0, 0], type='', swap='face-4'), + 13: + dict(name='face-13', id=13, color=[255, 0, 0], type='', swap='face-3'), + 14: + dict(name='face-14', id=14, color=[255, 0, 0], type='', swap='face-2'), + 15: + dict(name='face-15', id=15, color=[255, 0, 0], type='', swap='face-1'), + 16: + dict(name='face-16', id=16, color=[255, 0, 0], type='', swap='face-0'), + 17: dict( + name='face-17', id=17, color=[255, 0, 0], type='', swap='face-26'), + 18: dict( + name='face-18', id=18, color=[255, 0, 0], type='', swap='face-25'), + 19: dict( + name='face-19', id=19, color=[255, 0, 0], type='', swap='face-24'), + 20: dict( + name='face-20', id=20, color=[255, 0, 0], type='', swap='face-23'), + 21: dict( + name='face-21', id=21, color=[255, 0, 0], type='', swap='face-22'), + 22: dict( + name='face-22', id=22, color=[255, 0, 0], type='', swap='face-21'), + 23: dict( + name='face-23', 
id=23, color=[255, 0, 0], type='', swap='face-20'), + 24: dict( + name='face-24', id=24, color=[255, 0, 0], type='', swap='face-19'), + 25: dict( + name='face-25', id=25, color=[255, 0, 0], type='', swap='face-18'), + 26: dict( + name='face-26', id=26, color=[255, 0, 0], type='', swap='face-17'), + 27: dict(name='face-27', id=27, color=[255, 0, 0], type='', swap=''), + 28: dict(name='face-28', id=28, color=[255, 0, 0], type='', swap=''), + 29: dict(name='face-29', id=29, color=[255, 0, 0], type='', swap=''), + 30: dict(name='face-30', id=30, color=[255, 0, 0], type='', swap=''), + 31: dict( + name='face-31', id=31, color=[255, 0, 0], type='', swap='face-35'), + 32: dict( + name='face-32', id=32, color=[255, 0, 0], type='', swap='face-34'), + 33: dict(name='face-33', id=33, color=[255, 0, 0], type='', swap=''), + 34: dict( + name='face-34', id=34, color=[255, 0, 0], type='', swap='face-32'), + 35: dict( + name='face-35', id=35, color=[255, 0, 0], type='', swap='face-31'), + 36: dict( + name='face-36', id=36, color=[255, 0, 0], type='', swap='face-45'), + 37: dict( + name='face-37', id=37, color=[255, 0, 0], type='', swap='face-44'), + 38: dict( + name='face-38', id=38, color=[255, 0, 0], type='', swap='face-43'), + 39: dict( + name='face-39', id=39, color=[255, 0, 0], type='', swap='face-42'), + 40: dict( + name='face-40', id=40, color=[255, 0, 0], type='', swap='face-47'), + 41: dict( + name='face-41', id=41, color=[255, 0, 0], type='', swap='face-46'), + 42: dict( + name='face-42', id=42, color=[255, 0, 0], type='', swap='face-39'), + 43: dict( + name='face-43', id=43, color=[255, 0, 0], type='', swap='face-38'), + 44: dict( + name='face-44', id=44, color=[255, 0, 0], type='', swap='face-37'), + 45: dict( + name='face-45', id=45, color=[255, 0, 0], type='', swap='face-36'), + 46: dict( + name='face-46', id=46, color=[255, 0, 0], type='', swap='face-41'), + 47: dict( + name='face-47', id=47, color=[255, 0, 0], type='', swap='face-40'), + 48: dict( + name='face-48', 
id=48, color=[255, 0, 0], type='', swap='face-54'), + 49: dict( + name='face-49', id=49, color=[255, 0, 0], type='', swap='face-53'), + 50: dict( + name='face-50', id=50, color=[255, 0, 0], type='', swap='face-52'), + 51: dict(name='face-51', id=51, color=[255, 0, 0], type='', swap=''), + 52: dict( + name='face-52', id=52, color=[255, 0, 0], type='', swap='face-50'), + 53: dict( + name='face-53', id=53, color=[255, 0, 0], type='', swap='face-49'), + 54: dict( + name='face-54', id=54, color=[255, 0, 0], type='', swap='face-48'), + 55: dict( + name='face-55', id=55, color=[255, 0, 0], type='', swap='face-59'), + 56: dict( + name='face-56', id=56, color=[255, 0, 0], type='', swap='face-58'), + 57: dict(name='face-57', id=57, color=[255, 0, 0], type='', swap=''), + 58: dict( + name='face-58', id=58, color=[255, 0, 0], type='', swap='face-56'), + 59: dict( + name='face-59', id=59, color=[255, 0, 0], type='', swap='face-55'), + 60: dict( + name='face-60', id=60, color=[255, 0, 0], type='', swap='face-64'), + 61: dict( + name='face-61', id=61, color=[255, 0, 0], type='', swap='face-63'), + 62: dict(name='face-62', id=62, color=[255, 0, 0], type='', swap=''), + 63: dict( + name='face-63', id=63, color=[255, 0, 0], type='', swap='face-61'), + 64: dict( + name='face-64', id=64, color=[255, 0, 0], type='', swap='face-60'), + 65: dict( + name='face-65', id=65, color=[255, 0, 0], type='', swap='face-67'), + 66: dict(name='face-66', id=66, color=[255, 0, 0], type='', swap=''), + 67: dict( + name='face-67', id=67, color=[255, 0, 0], type='', swap='face-65') + }, + skeleton_info={}, + joint_weights=[1.] 
* 68, + + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L177' + sigmas=[ + 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, 0.025, 0.020, 0.023, + 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, 0.013, 0.012, 0.011, + 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, 0.009, 0.007, 0.007, + 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, 0.011, 0.009, 0.011, + 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, 0.034, 0.008, 0.008, + 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, 0.009, 0.009, 0.007, + 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, 0.008 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/coco_wholebody_hand.py b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_hand.py new file mode 100644 index 0000000..585ed78 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_hand.py @@ -0,0 +1,147 @@ +dataset_info = dict( + dataset_name='coco_wholebody_hand', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + 
name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + 
link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[ + 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, 0.018, + 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, 0.022, + 0.031 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/coco_wholebody_openpose.py b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_openpose.py new file mode 100644 index 0000000..315c0fb --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_openpose.py @@ -0,0 +1,1128 @@ +dataset_info = dict( + dataset_name='coco_wholebody_openpose', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[255, 0, 0], type='upper', swap=''), + 1: + dict(name='neck', id=1, color=[255, 85, 0], type='upper', swap=''), + 2: + dict( + name='right_shoulder', + id=2, + color=[255, 170, 0], + type='upper', + swap='left_shoulder'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 
255, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='right_wrist', + id=4, + color=[170, 255, 0], + type='upper', + swap='left_wrist'), + 5: + dict( + name='left_shoulder', + id=5, + color=[85, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_elbow', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 7: + dict( + name='left_wrist', + id=7, + color=[0, 255, 85], + type='upper', + swap='right_wrist'), + 8: + dict( + name='right_hip', + id=8, + color=[0, 255, 170], + type='lower', + swap='left_hip'), + 9: + dict( + name='right_knee', + id=9, + color=[0, 255, 255], + type='lower', + swap='left_knee'), + 10: + dict( + name='right_ankle', + id=10, + color=[0, 170, 255], + type='lower', + swap='left_ankle'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 85, 255], + type='lower', + swap='right_hip'), + 12: + dict( + name='left_knee', + id=12, + color=[0, 0, 255], + type='lower', + swap='right_knee'), + 13: + dict( + name='left_ankle', + id=13, + color=[85, 0, 255], + type='lower', + swap='right_ankle'), + 14: + dict( + name='right_eye', + id=14, + color=[170, 0, 255], + type='upper', + swap='left_eye'), + 15: + dict( + name='left_eye', + id=15, + color=[255, 0, 255], + type='upper', + swap='right_eye'), + 16: + dict( + name='right_ear', + id=16, + color=[255, 0, 170], + type='upper', + swap='left_ear'), + 17: + dict( + name='left_ear', + id=17, + color=[255, 0, 85], + type='upper', + swap='right_ear'), + 18: + dict( + name='left_big_toe', + id=17, + color=[0, 0, 0], + type='lower', + swap='right_big_toe'), + 19: + dict( + name='left_small_toe', + id=18, + color=[0, 0, 0], + type='lower', + swap='right_small_toe'), + 20: + dict( + name='left_heel', + id=19, + color=[0, 0, 0], + type='lower', + swap='right_heel'), + 21: + dict( + name='right_big_toe', + id=20, + color=[0, 0, 0], + type='lower', + swap='left_big_toe'), + 22: + dict( + name='right_small_toe', + id=21, + color=[0, 0, 0], + type='lower', + 
swap='left_small_toe'), + 23: + dict( + name='right_heel', + id=22, + color=[0, 0, 0], + type='lower', + swap='left_heel'), + 24: + dict( + name='face-0', + id=23, + color=[255, 255, 255], + type='', + swap='face-16'), + 25: + dict( + name='face-1', + id=24, + color=[255, 255, 255], + type='', + swap='face-15'), + 26: + dict( + name='face-2', + id=25, + color=[255, 255, 255], + type='', + swap='face-14'), + 27: + dict( + name='face-3', + id=26, + color=[255, 255, 255], + type='', + swap='face-13'), + 28: + dict( + name='face-4', + id=27, + color=[255, 255, 255], + type='', + swap='face-12'), + 29: + dict( + name='face-5', + id=28, + color=[255, 255, 255], + type='', + swap='face-11'), + 30: + dict( + name='face-6', + id=29, + color=[255, 255, 255], + type='', + swap='face-10'), + 31: + dict( + name='face-7', + id=30, + color=[255, 255, 255], + type='', + swap='face-9'), + 32: + dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + 33: + dict( + name='face-9', + id=32, + color=[255, 255, 255], + type='', + swap='face-7'), + 34: + dict( + name='face-10', + id=33, + color=[255, 255, 255], + type='', + swap='face-6'), + 35: + dict( + name='face-11', + id=34, + color=[255, 255, 255], + type='', + swap='face-5'), + 36: + dict( + name='face-12', + id=35, + color=[255, 255, 255], + type='', + swap='face-4'), + 37: + dict( + name='face-13', + id=36, + color=[255, 255, 255], + type='', + swap='face-3'), + 38: + dict( + name='face-14', + id=37, + color=[255, 255, 255], + type='', + swap='face-2'), + 39: + dict( + name='face-15', + id=38, + color=[255, 255, 255], + type='', + swap='face-1'), + 40: + dict( + name='face-16', + id=39, + color=[255, 255, 255], + type='', + swap='face-0'), + 41: + dict( + name='face-17', + id=40, + color=[255, 255, 255], + type='', + swap='face-26'), + 42: + dict( + name='face-18', + id=41, + color=[255, 255, 255], + type='', + swap='face-25'), + 43: + dict( + name='face-19', + id=42, + color=[255, 255, 255], + type='', + 
swap='face-24'), + 44: + dict( + name='face-20', + id=43, + color=[255, 255, 255], + type='', + swap='face-23'), + 45: + dict( + name='face-21', + id=44, + color=[255, 255, 255], + type='', + swap='face-22'), + 46: + dict( + name='face-22', + id=45, + color=[255, 255, 255], + type='', + swap='face-21'), + 47: + dict( + name='face-23', + id=46, + color=[255, 255, 255], + type='', + swap='face-20'), + 48: + dict( + name='face-24', + id=47, + color=[255, 255, 255], + type='', + swap='face-19'), + 49: + dict( + name='face-25', + id=48, + color=[255, 255, 255], + type='', + swap='face-18'), + 50: + dict( + name='face-26', + id=49, + color=[255, 255, 255], + type='', + swap='face-17'), + 51: + dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + 54: + dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + 55: + dict( + name='face-31', + id=54, + color=[255, 255, 255], + type='', + swap='face-35'), + 56: + dict( + name='face-32', + id=55, + color=[255, 255, 255], + type='', + swap='face-34'), + 57: + dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='face-34', + id=57, + color=[255, 255, 255], + type='', + swap='face-32'), + 59: + dict( + name='face-35', + id=58, + color=[255, 255, 255], + type='', + swap='face-31'), + 60: + dict( + name='face-36', + id=59, + color=[255, 255, 255], + type='', + swap='face-45'), + 61: + dict( + name='face-37', + id=60, + color=[255, 255, 255], + type='', + swap='face-44'), + 62: + dict( + name='face-38', + id=61, + color=[255, 255, 255], + type='', + swap='face-43'), + 63: + dict( + name='face-39', + id=62, + color=[255, 255, 255], + type='', + swap='face-42'), + 64: + dict( + name='face-40', + id=63, + color=[255, 255, 255], + type='', + swap='face-47'), + 65: + dict( + name='face-41', + id=64, + color=[255, 
255, 255], + type='', + swap='face-46'), + 66: + dict( + name='face-42', + id=65, + color=[255, 255, 255], + type='', + swap='face-39'), + 67: + dict( + name='face-43', + id=66, + color=[255, 255, 255], + type='', + swap='face-38'), + 68: + dict( + name='face-44', + id=67, + color=[255, 255, 255], + type='', + swap='face-37'), + 69: + dict( + name='face-45', + id=68, + color=[255, 255, 255], + type='', + swap='face-36'), + 70: + dict( + name='face-46', + id=69, + color=[255, 255, 255], + type='', + swap='face-41'), + 71: + dict( + name='face-47', + id=70, + color=[255, 255, 255], + type='', + swap='face-40'), + 72: + dict( + name='face-48', + id=71, + color=[255, 255, 255], + type='', + swap='face-54'), + 73: + dict( + name='face-49', + id=72, + color=[255, 255, 255], + type='', + swap='face-53'), + 74: + dict( + name='face-50', + id=73, + color=[255, 255, 255], + type='', + swap='face-52'), + 75: + dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + 76: + dict( + name='face-52', + id=75, + color=[255, 255, 255], + type='', + swap='face-50'), + 77: + dict( + name='face-53', + id=76, + color=[255, 255, 255], + type='', + swap='face-49'), + 78: + dict( + name='face-54', + id=77, + color=[255, 255, 255], + type='', + swap='face-48'), + 79: + dict( + name='face-55', + id=78, + color=[255, 255, 255], + type='', + swap='face-59'), + 80: + dict( + name='face-56', + id=79, + color=[255, 255, 255], + type='', + swap='face-58'), + 81: + dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + 82: + dict( + name='face-58', + id=81, + color=[255, 255, 255], + type='', + swap='face-56'), + 83: + dict( + name='face-59', + id=82, + color=[255, 255, 255], + type='', + swap='face-55'), + 84: + dict( + name='face-60', + id=83, + color=[255, 255, 255], + type='', + swap='face-64'), + 85: + dict( + name='face-61', + id=84, + color=[255, 255, 255], + type='', + swap='face-63'), + 86: + dict(name='face-62', id=85, color=[255, 255, 255], type='', 
swap=''), + 87: + dict( + name='face-63', + id=86, + color=[255, 255, 255], + type='', + swap='face-61'), + 88: + dict( + name='face-64', + id=87, + color=[255, 255, 255], + type='', + swap='face-60'), + 89: + dict( + name='face-65', + id=88, + color=[255, 255, 255], + type='', + swap='face-67'), + 90: + dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + 91: + dict( + name='face-67', + id=90, + color=[255, 255, 255], + type='', + swap='face-65'), + 92: + dict( + name='left_hand_root', + id=92, + color=[0, 0, 255], + type='', + swap='right_hand_root'), + 93: + dict( + name='left_thumb1', + id=93, + color=[0, 0, 255], + type='', + swap='right_thumb1'), + 94: + dict( + name='left_thumb2', + id=94, + color=[0, 0, 255], + type='', + swap='right_thumb2'), + 95: + dict( + name='left_thumb3', + id=95, + color=[0, 0, 255], + type='', + swap='right_thumb3'), + 96: + dict( + name='left_thumb4', + id=96, + color=[0, 0, 255], + type='', + swap='right_thumb4'), + 97: + dict( + name='left_forefinger1', + id=97, + color=[0, 0, 255], + type='', + swap='right_forefinger1'), + 98: + dict( + name='left_forefinger2', + id=98, + color=[0, 0, 255], + type='', + swap='right_forefinger2'), + 99: + dict( + name='left_forefinger3', + id=99, + color=[0, 0, 255], + type='', + swap='right_forefinger3'), + 100: + dict( + name='left_forefinger4', + id=100, + color=[0, 0, 255], + type='', + swap='right_forefinger4'), + 101: + dict( + name='left_middle_finger1', + id=101, + color=[0, 0, 255], + type='', + swap='right_middle_finger1'), + 102: + dict( + name='left_middle_finger2', + id=102, + color=[0, 0, 255], + type='', + swap='right_middle_finger2'), + 103: + dict( + name='left_middle_finger3', + id=103, + color=[0, 0, 255], + type='', + swap='right_middle_finger3'), + 104: + dict( + name='left_middle_finger4', + id=104, + color=[0, 0, 255], + type='', + swap='right_middle_finger4'), + 105: + dict( + name='left_ring_finger1', + id=105, + color=[0, 0, 255], + type='', + 
swap='right_ring_finger1'), + 106: + dict( + name='left_ring_finger2', + id=106, + color=[0, 0, 255], + type='', + swap='right_ring_finger2'), + 107: + dict( + name='left_ring_finger3', + id=107, + color=[0, 0, 255], + type='', + swap='right_ring_finger3'), + 108: + dict( + name='left_ring_finger4', + id=108, + color=[0, 0, 255], + type='', + swap='right_ring_finger4'), + 109: + dict( + name='left_pinky_finger1', + id=109, + color=[0, 0, 255], + type='', + swap='right_pinky_finger1'), + 110: + dict( + name='left_pinky_finger2', + id=110, + color=[0, 0, 255], + type='', + swap='right_pinky_finger2'), + 111: + dict( + name='left_pinky_finger3', + id=111, + color=[0, 0, 255], + type='', + swap='right_pinky_finger3'), + 112: + dict( + name='left_pinky_finger4', + id=112, + color=[0, 0, 255], + type='', + swap='right_pinky_finger4'), + 113: + dict( + name='right_hand_root', + id=113, + color=[0, 0, 255], + type='', + swap='left_hand_root'), + 114: + dict( + name='right_thumb1', + id=114, + color=[0, 0, 255], + type='', + swap='left_thumb1'), + 115: + dict( + name='right_thumb2', + id=115, + color=[0, 0, 255], + type='', + swap='left_thumb2'), + 116: + dict( + name='right_thumb3', + id=116, + color=[0, 0, 255], + type='', + swap='left_thumb3'), + 117: + dict( + name='right_thumb4', + id=117, + color=[0, 0, 255], + type='', + swap='left_thumb4'), + 118: + dict( + name='right_forefinger1', + id=118, + color=[0, 0, 255], + type='', + swap='left_forefinger1'), + 119: + dict( + name='right_forefinger2', + id=119, + color=[0, 0, 255], + type='', + swap='left_forefinger2'), + 120: + dict( + name='right_forefinger3', + id=120, + color=[0, 0, 255], + type='', + swap='left_forefinger3'), + 121: + dict( + name='right_forefinger4', + id=121, + color=[0, 0, 255], + type='', + swap='left_forefinger4'), + 122: + dict( + name='right_middle_finger1', + id=122, + color=[0, 0, 255], + type='', + swap='left_middle_finger1'), + 123: + dict( + name='right_middle_finger2', + id=123, + 
color=[0, 0, 255], + type='', + swap='left_middle_finger2'), + 124: + dict( + name='right_middle_finger3', + id=124, + color=[0, 0, 255], + type='', + swap='left_middle_finger3'), + 125: + dict( + name='right_middle_finger4', + id=125, + color=[0, 0, 255], + type='', + swap='left_middle_finger4'), + 126: + dict( + name='right_ring_finger1', + id=126, + color=[0, 0, 255], + type='', + swap='left_ring_finger1'), + 127: + dict( + name='right_ring_finger2', + id=127, + color=[0, 0, 255], + type='', + swap='left_ring_finger2'), + 128: + dict( + name='right_ring_finger3', + id=128, + color=[0, 0, 255], + type='', + swap='left_ring_finger3'), + 129: + dict( + name='right_ring_finger4', + id=129, + color=[0, 0, 255], + type='', + swap='left_ring_finger4'), + 130: + dict( + name='right_pinky_finger1', + id=130, + color=[0, 0, 255], + type='', + swap='left_pinky_finger1'), + 131: + dict( + name='right_pinky_finger2', + id=131, + color=[0, 0, 255], + type='', + swap='left_pinky_finger2'), + 132: + dict( + name='right_pinky_finger3', + id=132, + color=[0, 0, 255], + type='', + swap='left_pinky_finger3'), + 133: + dict( + name='right_pinky_finger4', + id=133, + color=[0, 0, 255], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('neck', 'right_shoulder'), id=0, color=[255, 0, 0]), + 1: + dict(link=('neck', 'left_shoulder'), id=1, color=[255, 85, 0]), + 2: + dict( + link=('right_shoulder', 'right_elbow'), id=2, color=[255, 170, 0]), + 3: + dict(link=('right_elbow', 'right_wrist'), id=3, color=[255, 255, 0]), + 4: + dict(link=('left_shoulder', 'left_elbow'), id=4, color=[170, 255, 0]), + 5: + dict(link=('left_elbow', 'left_wrist'), id=5, color=[85, 255, 0]), + 6: + dict(link=('neck', 'right_hip'), id=6, color=[0, 255, 0]), + 7: + dict(link=('right_hip', 'right_knee'), id=7, color=[0, 255, 85]), + 8: + dict(link=('right_knee', 'right_ankle'), id=8, color=[0, 255, 170]), + 9: + dict(link=('neck', 'left_hip'), id=9, color=[0, 255, 225]), + 10: + 
dict(link=('left_hip', 'left_knee'), id=10, color=[0, 170, 255]), + 11: + dict(link=('left_knee', 'left_ankle'), id=11, color=[0, 85, 255]), + 12: + dict(link=('neck', 'nose'), id=12, color=[0, 0, 255]), + 13: + dict(link=('nose', 'right_eye'), id=13, color=[255, 0, 170]), + 14: + dict(link=('right_eye', 'right_ear'), id=14, color=[170, 0, 255]), + 15: + dict(link=('nose', 'left_eye'), id=15, color=[255, 0, 255]), + 16: + dict(link=('left_eye', 'left_ear'), id=16, color=[255, 0, 170]), + 17: + dict(link=('left_hand_root', 'left_thumb1'), id=17, color=[255, 0, 0]), + 18: + dict(link=('left_thumb1', 'left_thumb2'), id=18, color=[255, 76, 0]), + 19: + dict(link=('left_thumb2', 'left_thumb3'), id=19, color=[255, 153, 0]), + 20: + dict(link=('left_thumb3', 'left_thumb4'), id=20, color=[255, 230, 0]), + 21: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=21, + color=[204, 255, 0]), + 22: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=22, + color=[128, 255, 0]), + 23: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=23, + color=[51, 255, 0]), + 24: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=24, + color=[0, 255, 26]), + 25: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=25, + color=[0, 255, 102]), + 26: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=26, + color=[0, 255, 178]), + 27: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=27, + color=[0, 255, 255]), + 28: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=28, + color=[0, 178, 255]), + 29: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=29, + color=[0, 102, 255]), + 30: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=30, + color=[0, 26, 255]), + 31: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=31, + color=[51, 0, 255]), + 32: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=32, + color=[128, 0, 255]), + 33: + dict( + 
link=('left_hand_root', 'left_pinky_finger1'), + id=33, + color=[204, 0, 255]), + 34: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=34, + color=[255, 0, 230]), + 35: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=35, + color=[255, 0, 153]), + 36: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=36, + color=[255, 0, 76]), + 37: + dict( + link=('right_hand_root', 'right_thumb1'), id=37, color=[255, 0, + 0]), + 38: + dict(link=('right_thumb1', 'right_thumb2'), id=38, color=[255, 76, 0]), + 39: + dict( + link=('right_thumb2', 'right_thumb3'), id=39, color=[255, 153, 0]), + 40: + dict( + link=('right_thumb3', 'right_thumb4'), id=40, color=[255, 230, 0]), + 41: + dict( + link=('right_hand_root', 'right_forefinger1'), + id=41, + color=[204, 255, 0]), + 42: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=42, + color=[128, 255, 0]), + 43: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=43, + color=[51, 255, 0]), + 44: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=44, + color=[0, 255, 26]), + 45: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=45, + color=[0, 255, 102]), + 46: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=46, + color=[0, 255, 178]), + 47: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=47, + color=[255, 255, 255]), + 48: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=48, + color=[0, 178, 255]), + 49: + dict( + link=('right_hand_root', 'right_ring_finger1'), + id=49, + color=[0, 102, 255]), + 50: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=50, + color=[0, 26, 255]), + 51: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=51, + color=[51, 0, 255]), + 52: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=52, + color=[128, 0, 255]), + 53: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=53, + color=[204, 
0, 255]), + 54: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=54, + color=[255, 0, 230]), + 55: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=55, + color=[255, 0, 153]), + 56: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=56, + color=[255, 0, 76]) + }, + joint_weights=[1.] * 134, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, + 0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, + 0.066, 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, + 0.031, 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, + 0.045, 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, + 0.015, 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, + 0.017, 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, + 0.010, 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, + 0.009, 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, + 0.01, 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, + 0.024, 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, + 0.02, 0.019, 0.022, 0.031 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/cofw.py b/modules/rtmpose/configs/_base_/datasets/cofw.py new file mode 100644 index 0000000..8879254 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/cofw.py @@ -0,0 +1,57 @@ +dataset_info = dict( + dataset_name='cofw', + paper_info=dict( + author='Burgos-Artizzu, Xavier P and Perona, ' + r'Pietro and Doll{\'a}r, Piotr', + title='Robust face landmark estimation under occlusion', + container='Proceedings of the IEEE international ' + 'conference on computer vision', + year='2013', + 
homepage='http://www.vision.caltech.edu/xpburgos/ICCV13/', + ), + keypoint_info={ + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-1'), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-0'), + 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-3'), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-2'), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-6'), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-7'), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-4'), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-5'), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-9'), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-8'), + 10: + dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-11'), + 11: + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-10'), + 12: + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-14'), + 13: + dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-15'), + 14: + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-12'), + 15: + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-13'), + 16: + dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap='kpt-17'), + 17: + dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-16'), + 18: + dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-19'), + 19: + dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-18'), + 20: dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap=''), + 21: dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap=''), + 22: + dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-23'), + 23: + dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-22'), + 24: dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap=''), + 25: dict(name='kpt-25', 
id=25, color=[255, 0, 0], type='', swap=''), + 26: dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap=''), + 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''), + 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap='') + }, + skeleton_info={}, + joint_weights=[1.] * 29, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/crowdpose.py b/modules/rtmpose/configs/_base_/datasets/crowdpose.py new file mode 100644 index 0000000..358d36f --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/crowdpose.py @@ -0,0 +1,147 @@ +dataset_info = dict( + dataset_name='crowdpose', + paper_info=dict( + author='Li, Jiefeng and Wang, Can and Zhu, Hao and ' + 'Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu', + title='CrowdPose: Efficient Crowded Scenes Pose Estimation ' + 'and A New Benchmark', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2019', + homepage='https://github.com/Jeff-sjtu/CrowdPose', + ), + keypoint_info={ + 0: + dict( + name='left_shoulder', + id=0, + color=[51, 153, 255], + type='upper', + swap='right_shoulder'), + 1: + dict( + name='right_shoulder', + id=1, + color=[51, 153, 255], + type='upper', + swap='left_shoulder'), + 2: + dict( + name='left_elbow', + id=2, + color=[51, 153, 255], + type='upper', + swap='right_elbow'), + 3: + dict( + name='right_elbow', + id=3, + color=[51, 153, 255], + type='upper', + swap='left_elbow'), + 4: + dict( + name='left_wrist', + id=4, + color=[51, 153, 255], + type='upper', + swap='right_wrist'), + 5: + dict( + name='right_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='left_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_hip', + id=7, + color=[0, 255, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='left_knee', + id=8, + color=[255, 128, 0], + type='lower', + swap='right_knee'), + 9: + dict( + 
name='right_knee', + id=9, + color=[0, 255, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_ankle', + id=10, + color=[255, 128, 0], + type='lower', + swap='right_ankle'), + 11: + dict( + name='right_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='left_ankle'), + 12: + dict( + name='top_head', id=12, color=[255, 128, 0], type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[0, 255, 0], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('top_head', 'neck'), id=12, color=[51, 153, 255]), + 13: + dict(link=('right_shoulder', 'neck'), id=13, color=[51, 153, 255]), + 14: + dict(link=('left_shoulder', 'neck'), id=14, color=[51, 153, 255]) + }, + joint_weights=[ + 0.2, 0.2, 0.2, 1.3, 1.5, 0.2, 1.3, 1.5, 0.2, 0.2, 0.5, 0.2, 0.2, 0.5 + ], + sigmas=[ + 0.079, 0.079, 0.072, 0.072, 0.062, 0.062, 0.107, 0.107, 0.087, 0.087, + 0.089, 0.089, 0.079, 0.079 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/deepfashion2.py b/modules/rtmpose/configs/_base_/datasets/deepfashion2.py new file mode 100644 index 
0000000..de7004e --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/deepfashion2.py @@ -0,0 +1,2660 @@ +colors = dict( + sss=[255, 128, 0], # short_sleeve_shirt + lss=[255, 0, 128], # long_sleeved_shirt + sso=[128, 0, 255], # short_sleeved_outwear + lso=[0, 128, 255], # long_sleeved_outwear + vest=[0, 128, 128], # vest + sling=[0, 0, 128], # sling + shorts=[128, 128, 128], # shorts + trousers=[128, 0, 128], # trousers + skirt=[64, 128, 128], # skirt + ssd=[64, 64, 128], # short_sleeved_dress + lsd=[128, 64, 0], # long_sleeved_dress + vd=[128, 64, 255], # vest_dress + sd=[128, 64, 0], # sling_dress +) +dataset_info = dict( + dataset_name='deepfashion2', + paper_info=dict( + author='Yuying Ge and Ruimao Zhang and Lingyun Wu ' + 'and Xiaogang Wang and Xiaoou Tang and Ping Luo', + title='DeepFashion2: A Versatile Benchmark for ' + 'Detection, Pose Estimation, Segmentation and ' + 'Re-Identification of Clothing Images', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2019', + homepage='https://github.com/switchablenorms/DeepFashion2', + ), + keypoint_info={ + # short_sleeved_shirt + 0: + dict(name='sss_kpt1', id=0, color=colors['sss'], type='', swap=''), + 1: + dict( + name='sss_kpt2', + id=1, + color=colors['sss'], + type='', + swap='sss_kpt6'), + 2: + dict( + name='sss_kpt3', + id=2, + color=colors['sss'], + type='', + swap='sss_kpt5'), + 3: + dict(name='sss_kpt4', id=3, color=colors['sss'], type='', swap=''), + 4: + dict( + name='sss_kpt5', + id=4, + color=colors['sss'], + type='', + swap='sss_kpt3'), + 5: + dict( + name='sss_kpt6', + id=5, + color=colors['sss'], + type='', + swap='sss_kpt2'), + 6: + dict( + name='sss_kpt7', + id=6, + color=colors['sss'], + type='', + swap='sss_kpt25'), + 7: + dict( + name='sss_kpt8', + id=7, + color=colors['sss'], + type='', + swap='sss_kpt24'), + 8: + dict( + name='sss_kpt9', + id=8, + color=colors['sss'], + type='', + swap='sss_kpt23'), + 9: + dict( + 
name='sss_kpt10', + id=9, + color=colors['sss'], + type='', + swap='sss_kpt22'), + 10: + dict( + name='sss_kpt11', + id=10, + color=colors['sss'], + type='', + swap='sss_kpt21'), + 11: + dict( + name='sss_kpt12', + id=11, + color=colors['sss'], + type='', + swap='sss_kpt20'), + 12: + dict( + name='sss_kpt13', + id=12, + color=colors['sss'], + type='', + swap='sss_kpt19'), + 13: + dict( + name='sss_kpt14', + id=13, + color=colors['sss'], + type='', + swap='sss_kpt18'), + 14: + dict( + name='sss_kpt15', + id=14, + color=colors['sss'], + type='', + swap='sss_kpt17'), + 15: + dict(name='sss_kpt16', id=15, color=colors['sss'], type='', swap=''), + 16: + dict( + name='sss_kpt17', + id=16, + color=colors['sss'], + type='', + swap='sss_kpt15'), + 17: + dict( + name='sss_kpt18', + id=17, + color=colors['sss'], + type='', + swap='sss_kpt14'), + 18: + dict( + name='sss_kpt19', + id=18, + color=colors['sss'], + type='', + swap='sss_kpt13'), + 19: + dict( + name='sss_kpt20', + id=19, + color=colors['sss'], + type='', + swap='sss_kpt12'), + 20: + dict( + name='sss_kpt21', + id=20, + color=colors['sss'], + type='', + swap='sss_kpt11'), + 21: + dict( + name='sss_kpt22', + id=21, + color=colors['sss'], + type='', + swap='sss_kpt10'), + 22: + dict( + name='sss_kpt23', + id=22, + color=colors['sss'], + type='', + swap='sss_kpt9'), + 23: + dict( + name='sss_kpt24', + id=23, + color=colors['sss'], + type='', + swap='sss_kpt8'), + 24: + dict( + name='sss_kpt25', + id=24, + color=colors['sss'], + type='', + swap='sss_kpt7'), + # long_sleeved_shirt + 25: + dict(name='lss_kpt1', id=25, color=colors['lss'], type='', swap=''), + 26: + dict( + name='lss_kpt2', + id=26, + color=colors['lss'], + type='', + swap='lss_kpt6'), + 27: + dict( + name='lss_kpt3', + id=27, + color=colors['lss'], + type='', + swap='lss_kpt5'), + 28: + dict(name='lss_kpt4', id=28, color=colors['lss'], type='', swap=''), + 29: + dict( + name='lss_kpt5', + id=29, + color=colors['lss'], + type='', + swap='lss_kpt3'), + 30: 
+ dict( + name='lss_kpt6', + id=30, + color=colors['lss'], + type='', + swap='lss_kpt2'), + 31: + dict( + name='lss_kpt7', + id=31, + color=colors['lss'], + type='', + swap='lss_kpt33'), + 32: + dict( + name='lss_kpt8', + id=32, + color=colors['lss'], + type='', + swap='lss_kpt32'), + 33: + dict( + name='lss_kpt9', + id=33, + color=colors['lss'], + type='', + swap='lss_kpt31'), + 34: + dict( + name='lss_kpt10', + id=34, + color=colors['lss'], + type='', + swap='lss_kpt30'), + 35: + dict( + name='lss_kpt11', + id=35, + color=colors['lss'], + type='', + swap='lss_kpt29'), + 36: + dict( + name='lss_kpt12', + id=36, + color=colors['lss'], + type='', + swap='lss_kpt28'), + 37: + dict( + name='lss_kpt13', + id=37, + color=colors['lss'], + type='', + swap='lss_kpt27'), + 38: + dict( + name='lss_kpt14', + id=38, + color=colors['lss'], + type='', + swap='lss_kpt26'), + 39: + dict( + name='lss_kpt15', + id=39, + color=colors['lss'], + type='', + swap='lss_kpt25'), + 40: + dict( + name='lss_kpt16', + id=40, + color=colors['lss'], + type='', + swap='lss_kpt24'), + 41: + dict( + name='lss_kpt17', + id=41, + color=colors['lss'], + type='', + swap='lss_kpt23'), + 42: + dict( + name='lss_kpt18', + id=42, + color=colors['lss'], + type='', + swap='lss_kpt22'), + 43: + dict( + name='lss_kpt19', + id=43, + color=colors['lss'], + type='', + swap='lss_kpt21'), + 44: + dict(name='lss_kpt20', id=44, color=colors['lss'], type='', swap=''), + 45: + dict( + name='lss_kpt21', + id=45, + color=colors['lss'], + type='', + swap='lss_kpt19'), + 46: + dict( + name='lss_kpt22', + id=46, + color=colors['lss'], + type='', + swap='lss_kpt18'), + 47: + dict( + name='lss_kpt23', + id=47, + color=colors['lss'], + type='', + swap='lss_kpt17'), + 48: + dict( + name='lss_kpt24', + id=48, + color=colors['lss'], + type='', + swap='lss_kpt16'), + 49: + dict( + name='lss_kpt25', + id=49, + color=colors['lss'], + type='', + swap='lss_kpt15'), + 50: + dict( + name='lss_kpt26', + id=50, + color=colors['lss'], + 
type='', + swap='lss_kpt14'), + 51: + dict( + name='lss_kpt27', + id=51, + color=colors['lss'], + type='', + swap='lss_kpt13'), + 52: + dict( + name='lss_kpt28', + id=52, + color=colors['lss'], + type='', + swap='lss_kpt12'), + 53: + dict( + name='lss_kpt29', + id=53, + color=colors['lss'], + type='', + swap='lss_kpt11'), + 54: + dict( + name='lss_kpt30', + id=54, + color=colors['lss'], + type='', + swap='lss_kpt10'), + 55: + dict( + name='lss_kpt31', + id=55, + color=colors['lss'], + type='', + swap='lss_kpt9'), + 56: + dict( + name='lss_kpt32', + id=56, + color=colors['lss'], + type='', + swap='lss_kpt8'), + 57: + dict( + name='lss_kpt33', + id=57, + color=colors['lss'], + type='', + swap='lss_kpt7'), + # short_sleeved_outwear + 58: + dict(name='sso_kpt1', id=58, color=colors['sso'], type='', swap=''), + 59: + dict( + name='sso_kpt2', + id=59, + color=colors['sso'], + type='', + swap='sso_kpt26'), + 60: + dict( + name='sso_kpt3', + id=60, + color=colors['sso'], + type='', + swap='sso_kpt5'), + 61: + dict( + name='sso_kpt4', + id=61, + color=colors['sso'], + type='', + swap='sso_kpt6'), + 62: + dict( + name='sso_kpt5', + id=62, + color=colors['sso'], + type='', + swap='sso_kpt3'), + 63: + dict( + name='sso_kpt6', + id=63, + color=colors['sso'], + type='', + swap='sso_kpt4'), + 64: + dict( + name='sso_kpt7', + id=64, + color=colors['sso'], + type='', + swap='sso_kpt25'), + 65: + dict( + name='sso_kpt8', + id=65, + color=colors['sso'], + type='', + swap='sso_kpt24'), + 66: + dict( + name='sso_kpt9', + id=66, + color=colors['sso'], + type='', + swap='sso_kpt23'), + 67: + dict( + name='sso_kpt10', + id=67, + color=colors['sso'], + type='', + swap='sso_kpt22'), + 68: + dict( + name='sso_kpt11', + id=68, + color=colors['sso'], + type='', + swap='sso_kpt21'), + 69: + dict( + name='sso_kpt12', + id=69, + color=colors['sso'], + type='', + swap='sso_kpt20'), + 70: + dict( + name='sso_kpt13', + id=70, + color=colors['sso'], + type='', + swap='sso_kpt19'), + 71: + dict( + 
name='sso_kpt14', + id=71, + color=colors['sso'], + type='', + swap='sso_kpt18'), + 72: + dict( + name='sso_kpt15', + id=72, + color=colors['sso'], + type='', + swap='sso_kpt17'), + 73: + dict( + name='sso_kpt16', + id=73, + color=colors['sso'], + type='', + swap='sso_kpt29'), + 74: + dict( + name='sso_kpt17', + id=74, + color=colors['sso'], + type='', + swap='sso_kpt15'), + 75: + dict( + name='sso_kpt18', + id=75, + color=colors['sso'], + type='', + swap='sso_kpt14'), + 76: + dict( + name='sso_kpt19', + id=76, + color=colors['sso'], + type='', + swap='sso_kpt13'), + 77: + dict( + name='sso_kpt20', + id=77, + color=colors['sso'], + type='', + swap='sso_kpt12'), + 78: + dict( + name='sso_kpt21', + id=78, + color=colors['sso'], + type='', + swap='sso_kpt11'), + 79: + dict( + name='sso_kpt22', + id=79, + color=colors['sso'], + type='', + swap='sso_kpt10'), + 80: + dict( + name='sso_kpt23', + id=80, + color=colors['sso'], + type='', + swap='sso_kpt9'), + 81: + dict( + name='sso_kpt24', + id=81, + color=colors['sso'], + type='', + swap='sso_kpt8'), + 82: + dict( + name='sso_kpt25', + id=82, + color=colors['sso'], + type='', + swap='sso_kpt7'), + 83: + dict( + name='sso_kpt26', + id=83, + color=colors['sso'], + type='', + swap='sso_kpt2'), + 84: + dict( + name='sso_kpt27', + id=84, + color=colors['sso'], + type='', + swap='sso_kpt30'), + 85: + dict( + name='sso_kpt28', + id=85, + color=colors['sso'], + type='', + swap='sso_kpt31'), + 86: + dict( + name='sso_kpt29', + id=86, + color=colors['sso'], + type='', + swap='sso_kpt16'), + 87: + dict( + name='sso_kpt30', + id=87, + color=colors['sso'], + type='', + swap='sso_kpt27'), + 88: + dict( + name='sso_kpt31', + id=88, + color=colors['sso'], + type='', + swap='sso_kpt28'), + # long_sleeved_outwear + 89: + dict(name='lso_kpt1', id=89, color=colors['lso'], type='', swap=''), + 90: + dict( + name='lso_kpt2', + id=90, + color=colors['lso'], + type='', + swap='lso_kpt6'), + 91: + dict( + name='lso_kpt3', + id=91, + 
color=colors['lso'], + type='', + swap='lso_kpt5'), + 92: + dict( + name='lso_kpt4', + id=92, + color=colors['lso'], + type='', + swap='lso_kpt34'), + 93: + dict( + name='lso_kpt5', + id=93, + color=colors['lso'], + type='', + swap='lso_kpt3'), + 94: + dict( + name='lso_kpt6', + id=94, + color=colors['lso'], + type='', + swap='lso_kpt2'), + 95: + dict( + name='lso_kpt7', + id=95, + color=colors['lso'], + type='', + swap='lso_kpt33'), + 96: + dict( + name='lso_kpt8', + id=96, + color=colors['lso'], + type='', + swap='lso_kpt32'), + 97: + dict( + name='lso_kpt9', + id=97, + color=colors['lso'], + type='', + swap='lso_kpt31'), + 98: + dict( + name='lso_kpt10', + id=98, + color=colors['lso'], + type='', + swap='lso_kpt30'), + 99: + dict( + name='lso_kpt11', + id=99, + color=colors['lso'], + type='', + swap='lso_kpt29'), + 100: + dict( + name='lso_kpt12', + id=100, + color=colors['lso'], + type='', + swap='lso_kpt28'), + 101: + dict( + name='lso_kpt13', + id=101, + color=colors['lso'], + type='', + swap='lso_kpt27'), + 102: + dict( + name='lso_kpt14', + id=102, + color=colors['lso'], + type='', + swap='lso_kpt26'), + 103: + dict( + name='lso_kpt15', + id=103, + color=colors['lso'], + type='', + swap='lso_kpt25'), + 104: + dict( + name='lso_kpt16', + id=104, + color=colors['lso'], + type='', + swap='lso_kpt24'), + 105: + dict( + name='lso_kpt17', + id=105, + color=colors['lso'], + type='', + swap='lso_kpt23'), + 106: + dict( + name='lso_kpt18', + id=106, + color=colors['lso'], + type='', + swap='lso_kpt22'), + 107: + dict( + name='lso_kpt19', + id=107, + color=colors['lso'], + type='', + swap='lso_kpt21'), + 108: + dict( + name='lso_kpt20', + id=108, + color=colors['lso'], + type='', + swap='lso_kpt37'), + 109: + dict( + name='lso_kpt21', + id=109, + color=colors['lso'], + type='', + swap='lso_kpt19'), + 110: + dict( + name='lso_kpt22', + id=110, + color=colors['lso'], + type='', + swap='lso_kpt18'), + 111: + dict( + name='lso_kpt23', + id=111, + color=colors['lso'], + 
type='', + swap='lso_kpt17'), + 112: + dict( + name='lso_kpt24', + id=112, + color=colors['lso'], + type='', + swap='lso_kpt16'), + 113: + dict( + name='lso_kpt25', + id=113, + color=colors['lso'], + type='', + swap='lso_kpt15'), + 114: + dict( + name='lso_kpt26', + id=114, + color=colors['lso'], + type='', + swap='lso_kpt14'), + 115: + dict( + name='lso_kpt27', + id=115, + color=colors['lso'], + type='', + swap='lso_kpt13'), + 116: + dict( + name='lso_kpt28', + id=116, + color=colors['lso'], + type='', + swap='lso_kpt12'), + 117: + dict( + name='lso_kpt29', + id=117, + color=colors['lso'], + type='', + swap='lso_kpt11'), + 118: + dict( + name='lso_kpt30', + id=118, + color=colors['lso'], + type='', + swap='lso_kpt10'), + 119: + dict( + name='lso_kpt31', + id=119, + color=colors['lso'], + type='', + swap='lso_kpt9'), + 120: + dict( + name='lso_kpt32', + id=120, + color=colors['lso'], + type='', + swap='lso_kpt8'), + 121: + dict( + name='lso_kpt33', + id=121, + color=colors['lso'], + type='', + swap='lso_kpt7'), + 122: + dict( + name='lso_kpt34', + id=122, + color=colors['lso'], + type='', + swap='lso_kpt4'), + 123: + dict( + name='lso_kpt35', + id=123, + color=colors['lso'], + type='', + swap='lso_kpt38'), + 124: + dict( + name='lso_kpt36', + id=124, + color=colors['lso'], + type='', + swap='lso_kpt39'), + 125: + dict( + name='lso_kpt37', + id=125, + color=colors['lso'], + type='', + swap='lso_kpt20'), + 126: + dict( + name='lso_kpt38', + id=126, + color=colors['lso'], + type='', + swap='lso_kpt35'), + 127: + dict( + name='lso_kpt39', + id=127, + color=colors['lso'], + type='', + swap='lso_kpt36'), + # vest + 128: + dict(name='vest_kpt1', id=128, color=colors['vest'], type='', swap=''), + 129: + dict( + name='vest_kpt2', + id=129, + color=colors['vest'], + type='', + swap='vest_kpt6'), + 130: + dict( + name='vest_kpt3', + id=130, + color=colors['vest'], + type='', + swap='vest_kpt5'), + 131: + dict(name='vest_kpt4', id=131, color=colors['vest'], type='', swap=''), 
+ 132: + dict( + name='vest_kpt5', + id=132, + color=colors['vest'], + type='', + swap='vest_kpt3'), + 133: + dict( + name='vest_kpt6', + id=133, + color=colors['vest'], + type='', + swap='vest_kpt2'), + 134: + dict( + name='vest_kpt7', + id=134, + color=colors['vest'], + type='', + swap='vest_kpt15'), + 135: + dict( + name='vest_kpt8', + id=135, + color=colors['vest'], + type='', + swap='vest_kpt14'), + 136: + dict( + name='vest_kpt9', + id=136, + color=colors['vest'], + type='', + swap='vest_kpt13'), + 137: + dict( + name='vest_kpt10', + id=137, + color=colors['vest'], + type='', + swap='vest_kpt12'), + 138: + dict( + name='vest_kpt11', id=138, color=colors['vest'], type='', swap=''), + 139: + dict( + name='vest_kpt12', + id=139, + color=colors['vest'], + type='', + swap='vest_kpt10'), + 140: + dict( + name='vest_kpt13', id=140, color=colors['vest'], type='', swap=''), + 141: + dict( + name='vest_kpt14', + id=141, + color=colors['vest'], + type='', + swap='vest_kpt8'), + 142: + dict( + name='vest_kpt15', + id=142, + color=colors['vest'], + type='', + swap='vest_kpt7'), + # sling + 143: + dict( + name='sling_kpt1', id=143, color=colors['sling'], type='', + swap=''), + 144: + dict( + name='sling_kpt2', + id=144, + color=colors['sling'], + type='', + swap='sling_kpt6'), + 145: + dict( + name='sling_kpt3', + id=145, + color=colors['sling'], + type='', + swap='sling_kpt5'), + 146: + dict( + name='sling_kpt4', id=146, color=colors['sling'], type='', + swap=''), + 147: + dict( + name='sling_kpt5', + id=147, + color=colors['sling'], + type='', + swap='sling_kpt3'), + 148: + dict( + name='sling_kpt6', + id=148, + color=colors['sling'], + type='', + swap='sling_kpt2'), + 149: + dict( + name='sling_kpt7', + id=149, + color=colors['sling'], + type='', + swap='sling_kpt15'), + 150: + dict( + name='sling_kpt8', + id=150, + color=colors['sling'], + type='', + swap='sling_kpt14'), + 151: + dict( + name='sling_kpt9', + id=151, + color=colors['sling'], + type='', + 
swap='sling_kpt13'), + 152: + dict( + name='sling_kpt10', + id=152, + color=colors['sling'], + type='', + swap='sling_kpt12'), + 153: + dict( + name='sling_kpt11', + id=153, + color=colors['sling'], + type='', + swap=''), + 154: + dict( + name='sling_kpt12', + id=154, + color=colors['sling'], + type='', + swap='sling_kpt10'), + 155: + dict( + name='sling_kpt13', + id=155, + color=colors['sling'], + type='', + swap='sling_kpt9'), + 156: + dict( + name='sling_kpt14', + id=156, + color=colors['sling'], + type='', + swap='sling_kpt8'), + 157: + dict( + name='sling_kpt15', + id=157, + color=colors['sling'], + type='', + swap='sling_kpt7'), + # shorts + 158: + dict( + name='shorts_kpt1', + id=158, + color=colors['shorts'], + type='', + swap='shorts_kpt3'), + 159: + dict( + name='shorts_kpt2', + id=159, + color=colors['shorts'], + type='', + swap=''), + 160: + dict( + name='shorts_kpt3', + id=160, + color=colors['shorts'], + type='', + swap='shorts_kpt1'), + 161: + dict( + name='shorts_kpt4', + id=161, + color=colors['shorts'], + type='', + swap='shorts_kpt10'), + 162: + dict( + name='shorts_kpt5', + id=162, + color=colors['shorts'], + type='', + swap='shorts_kpt9'), + 163: + dict( + name='shorts_kpt6', + id=163, + color=colors['shorts'], + type='', + swap='shorts_kpt8'), + 164: + dict( + name='shorts_kpt7', + id=164, + color=colors['shorts'], + type='', + swap=''), + 165: + dict( + name='shorts_kpt8', + id=165, + color=colors['shorts'], + type='', + swap='shorts_kpt6'), + 166: + dict( + name='shorts_kpt9', + id=166, + color=colors['shorts'], + type='', + swap='shorts_kpt5'), + 167: + dict( + name='shorts_kpt10', + id=167, + color=colors['shorts'], + type='', + swap='shorts_kpt4'), + # trousers + 168: + dict( + name='trousers_kpt1', + id=168, + color=colors['trousers'], + type='', + swap='trousers_kpt3'), + 169: + dict( + name='trousers_kpt2', + id=169, + color=colors['trousers'], + type='', + swap=''), + 170: + dict( + name='trousers_kpt3', + id=170, + 
color=colors['trousers'], + type='', + swap='trousers_kpt1'), + 171: + dict( + name='trousers_kpt4', + id=171, + color=colors['trousers'], + type='', + swap='trousers_kpt14'), + 172: + dict( + name='trousers_kpt5', + id=172, + color=colors['trousers'], + type='', + swap='trousers_kpt13'), + 173: + dict( + name='trousers_kpt6', + id=173, + color=colors['trousers'], + type='', + swap='trousers_kpt12'), + 174: + dict( + name='trousers_kpt7', + id=174, + color=colors['trousers'], + type='', + swap='trousers_kpt11'), + 175: + dict( + name='trousers_kpt8', + id=175, + color=colors['trousers'], + type='', + swap='trousers_kpt10'), + 176: + dict( + name='trousers_kpt9', + id=176, + color=colors['trousers'], + type='', + swap=''), + 177: + dict( + name='trousers_kpt10', + id=177, + color=colors['trousers'], + type='', + swap='trousers_kpt8'), + 178: + dict( + name='trousers_kpt11', + id=178, + color=colors['trousers'], + type='', + swap='trousers_kpt7'), + 179: + dict( + name='trousers_kpt12', + id=179, + color=colors['trousers'], + type='', + swap='trousers_kpt6'), + 180: + dict( + name='trousers_kpt13', + id=180, + color=colors['trousers'], + type='', + swap='trousers_kpt5'), + 181: + dict( + name='trousers_kpt14', + id=181, + color=colors['trousers'], + type='', + swap='trousers_kpt4'), + # skirt + 182: + dict( + name='skirt_kpt1', + id=182, + color=colors['skirt'], + type='', + swap='skirt_kpt3'), + 183: + dict( + name='skirt_kpt2', id=183, color=colors['skirt'], type='', + swap=''), + 184: + dict( + name='skirt_kpt3', + id=184, + color=colors['skirt'], + type='', + swap='skirt_kpt1'), + 185: + dict( + name='skirt_kpt4', + id=185, + color=colors['skirt'], + type='', + swap='skirt_kpt8'), + 186: + dict( + name='skirt_kpt5', + id=186, + color=colors['skirt'], + type='', + swap='skirt_kpt7'), + 187: + dict( + name='skirt_kpt6', id=187, color=colors['skirt'], type='', + swap=''), + 188: + dict( + name='skirt_kpt7', + id=188, + color=colors['skirt'], + type='', + 
swap='skirt_kpt5'), + 189: + dict( + name='skirt_kpt8', + id=189, + color=colors['skirt'], + type='', + swap='skirt_kpt4'), + # short_sleeved_dress + 190: + dict(name='ssd_kpt1', id=190, color=colors['ssd'], type='', swap=''), + 191: + dict( + name='ssd_kpt2', + id=191, + color=colors['ssd'], + type='', + swap='ssd_kpt6'), + 192: + dict( + name='ssd_kpt3', + id=192, + color=colors['ssd'], + type='', + swap='ssd_kpt5'), + 193: + dict(name='ssd_kpt4', id=193, color=colors['ssd'], type='', swap=''), + 194: + dict( + name='ssd_kpt5', + id=194, + color=colors['ssd'], + type='', + swap='ssd_kpt3'), + 195: + dict( + name='ssd_kpt6', + id=195, + color=colors['ssd'], + type='', + swap='ssd_kpt2'), + 196: + dict( + name='ssd_kpt7', + id=196, + color=colors['ssd'], + type='', + swap='ssd_kpt29'), + 197: + dict( + name='ssd_kpt8', + id=197, + color=colors['ssd'], + type='', + swap='ssd_kpt28'), + 198: + dict( + name='ssd_kpt9', + id=198, + color=colors['ssd'], + type='', + swap='ssd_kpt27'), + 199: + dict( + name='ssd_kpt10', + id=199, + color=colors['ssd'], + type='', + swap='ssd_kpt26'), + 200: + dict( + name='ssd_kpt11', + id=200, + color=colors['ssd'], + type='', + swap='ssd_kpt25'), + 201: + dict( + name='ssd_kpt12', + id=201, + color=colors['ssd'], + type='', + swap='ssd_kpt24'), + 202: + dict( + name='ssd_kpt13', + id=202, + color=colors['ssd'], + type='', + swap='ssd_kpt23'), + 203: + dict( + name='ssd_kpt14', + id=203, + color=colors['ssd'], + type='', + swap='ssd_kpt22'), + 204: + dict( + name='ssd_kpt15', + id=204, + color=colors['ssd'], + type='', + swap='ssd_kpt21'), + 205: + dict( + name='ssd_kpt16', + id=205, + color=colors['ssd'], + type='', + swap='ssd_kpt20'), + 206: + dict( + name='ssd_kpt17', + id=206, + color=colors['ssd'], + type='', + swap='ssd_kpt19'), + 207: + dict(name='ssd_kpt18', id=207, color=colors['ssd'], type='', swap=''), + 208: + dict( + name='ssd_kpt19', + id=208, + color=colors['ssd'], + type='', + swap='ssd_kpt17'), + 209: + dict( + 
name='ssd_kpt20', + id=209, + color=colors['ssd'], + type='', + swap='ssd_kpt16'), + 210: + dict( + name='ssd_kpt21', + id=210, + color=colors['ssd'], + type='', + swap='ssd_kpt15'), + 211: + dict( + name='ssd_kpt22', + id=211, + color=colors['ssd'], + type='', + swap='ssd_kpt14'), + 212: + dict( + name='ssd_kpt23', + id=212, + color=colors['ssd'], + type='', + swap='ssd_kpt13'), + 213: + dict( + name='ssd_kpt24', + id=213, + color=colors['ssd'], + type='', + swap='ssd_kpt12'), + 214: + dict( + name='ssd_kpt25', + id=214, + color=colors['ssd'], + type='', + swap='ssd_kpt11'), + 215: + dict( + name='ssd_kpt26', + id=215, + color=colors['ssd'], + type='', + swap='ssd_kpt10'), + 216: + dict( + name='ssd_kpt27', + id=216, + color=colors['ssd'], + type='', + swap='ssd_kpt9'), + 217: + dict( + name='ssd_kpt28', + id=217, + color=colors['ssd'], + type='', + swap='ssd_kpt8'), + 218: + dict( + name='ssd_kpt29', + id=218, + color=colors['ssd'], + type='', + swap='ssd_kpt7'), + # long_sleeved_dress + 219: + dict(name='lsd_kpt1', id=219, color=colors['lsd'], type='', swap=''), + 220: + dict( + name='lsd_kpt2', + id=220, + color=colors['lsd'], + type='', + swap='lsd_kpt6'), + 221: + dict( + name='lsd_kpt3', + id=221, + color=colors['lsd'], + type='', + swap='lsd_kpt5'), + 222: + dict(name='lsd_kpt4', id=222, color=colors['lsd'], type='', swap=''), + 223: + dict( + name='lsd_kpt5', + id=223, + color=colors['lsd'], + type='', + swap='lsd_kpt3'), + 224: + dict( + name='lsd_kpt6', + id=224, + color=colors['lsd'], + type='', + swap='lsd_kpt2'), + 225: + dict( + name='lsd_kpt7', + id=225, + color=colors['lsd'], + type='', + swap='lsd_kpt37'), + 226: + dict( + name='lsd_kpt8', + id=226, + color=colors['lsd'], + type='', + swap='lsd_kpt36'), + 227: + dict( + name='lsd_kpt9', + id=227, + color=colors['lsd'], + type='', + swap='lsd_kpt35'), + 228: + dict( + name='lsd_kpt10', + id=228, + color=colors['lsd'], + type='', + swap='lsd_kpt34'), + 229: + dict( + name='lsd_kpt11', + id=229, + 
color=colors['lsd'], + type='', + swap='lsd_kpt33'), + 230: + dict( + name='lsd_kpt12', + id=230, + color=colors['lsd'], + type='', + swap='lsd_kpt32'), + 231: + dict( + name='lsd_kpt13', + id=231, + color=colors['lsd'], + type='', + swap='lsd_kpt31'), + 232: + dict( + name='lsd_kpt14', + id=232, + color=colors['lsd'], + type='', + swap='lsd_kpt30'), + 233: + dict( + name='lsd_kpt15', + id=233, + color=colors['lsd'], + type='', + swap='lsd_kpt29'), + 234: + dict( + name='lsd_kpt16', + id=234, + color=colors['lsd'], + type='', + swap='lsd_kpt28'), + 235: + dict( + name='lsd_kpt17', + id=235, + color=colors['lsd'], + type='', + swap='lsd_kpt27'), + 236: + dict( + name='lsd_kpt18', + id=236, + color=colors['lsd'], + type='', + swap='lsd_kpt26'), + 237: + dict( + name='lsd_kpt19', + id=237, + color=colors['lsd'], + type='', + swap='lsd_kpt25'), + 238: + dict( + name='lsd_kpt20', + id=238, + color=colors['lsd'], + type='', + swap='lsd_kpt24'), + 239: + dict( + name='lsd_kpt21', + id=239, + color=colors['lsd'], + type='', + swap='lsd_kpt23'), + 240: + dict(name='lsd_kpt22', id=240, color=colors['lsd'], type='', swap=''), + 241: + dict( + name='lsd_kpt23', + id=241, + color=colors['lsd'], + type='', + swap='lsd_kpt21'), + 242: + dict( + name='lsd_kpt24', + id=242, + color=colors['lsd'], + type='', + swap='lsd_kpt20'), + 243: + dict( + name='lsd_kpt25', + id=243, + color=colors['lsd'], + type='', + swap='lsd_kpt19'), + 244: + dict( + name='lsd_kpt26', + id=244, + color=colors['lsd'], + type='', + swap='lsd_kpt18'), + 245: + dict( + name='lsd_kpt27', + id=245, + color=colors['lsd'], + type='', + swap='lsd_kpt17'), + 246: + dict( + name='lsd_kpt28', + id=246, + color=colors['lsd'], + type='', + swap='lsd_kpt16'), + 247: + dict( + name='lsd_kpt29', + id=247, + color=colors['lsd'], + type='', + swap='lsd_kpt15'), + 248: + dict( + name='lsd_kpt30', + id=248, + color=colors['lsd'], + type='', + swap='lsd_kpt14'), + 249: + dict( + name='lsd_kpt31', + id=249, + 
color=colors['lsd'], + type='', + swap='lsd_kpt13'), + 250: + dict( + name='lsd_kpt32', + id=250, + color=colors['lsd'], + type='', + swap='lsd_kpt12'), + 251: + dict( + name='lsd_kpt33', + id=251, + color=colors['lsd'], + type='', + swap='lsd_kpt11'), + 252: + dict( + name='lsd_kpt34', + id=252, + color=colors['lsd'], + type='', + swap='lsd_kpt10'), + 253: + dict( + name='lsd_kpt35', + id=253, + color=colors['lsd'], + type='', + swap='lsd_kpt9'), + 254: + dict( + name='lsd_kpt36', + id=254, + color=colors['lsd'], + type='', + swap='lsd_kpt8'), + 255: + dict( + name='lsd_kpt37', + id=255, + color=colors['lsd'], + type='', + swap='lsd_kpt7'), + # vest_dress + 256: + dict(name='vd_kpt1', id=256, color=colors['vd'], type='', swap=''), + 257: + dict( + name='vd_kpt2', + id=257, + color=colors['vd'], + type='', + swap='vd_kpt6'), + 258: + dict( + name='vd_kpt3', + id=258, + color=colors['vd'], + type='', + swap='vd_kpt5'), + 259: + dict(name='vd_kpt4', id=259, color=colors['vd'], type='', swap=''), + 260: + dict( + name='vd_kpt5', + id=260, + color=colors['vd'], + type='', + swap='vd_kpt3'), + 261: + dict( + name='vd_kpt6', + id=261, + color=colors['vd'], + type='', + swap='vd_kpt2'), + 262: + dict( + name='vd_kpt7', + id=262, + color=colors['vd'], + type='', + swap='vd_kpt19'), + 263: + dict( + name='vd_kpt8', + id=263, + color=colors['vd'], + type='', + swap='vd_kpt18'), + 264: + dict( + name='vd_kpt9', + id=264, + color=colors['vd'], + type='', + swap='vd_kpt17'), + 265: + dict( + name='vd_kpt10', + id=265, + color=colors['vd'], + type='', + swap='vd_kpt16'), + 266: + dict( + name='vd_kpt11', + id=266, + color=colors['vd'], + type='', + swap='vd_kpt15'), + 267: + dict( + name='vd_kpt12', + id=267, + color=colors['vd'], + type='', + swap='vd_kpt14'), + 268: + dict(name='vd_kpt13', id=268, color=colors['vd'], type='', swap=''), + 269: + dict( + name='vd_kpt14', + id=269, + color=colors['vd'], + type='', + swap='vd_kpt12'), + 270: + dict( + name='vd_kpt15', + id=270, + 
color=colors['vd'], + type='', + swap='vd_kpt11'), + 271: + dict( + name='vd_kpt16', + id=271, + color=colors['vd'], + type='', + swap='vd_kpt10'), + 272: + dict( + name='vd_kpt17', + id=272, + color=colors['vd'], + type='', + swap='vd_kpt9'), + 273: + dict( + name='vd_kpt18', + id=273, + color=colors['vd'], + type='', + swap='vd_kpt8'), + 274: + dict( + name='vd_kpt19', + id=274, + color=colors['vd'], + type='', + swap='vd_kpt7'), + # sling_dress + 275: + dict(name='sd_kpt1', id=275, color=colors['sd'], type='', swap=''), + 276: + dict( + name='sd_kpt2', + id=276, + color=colors['sd'], + type='', + swap='sd_kpt6'), + 277: + dict( + name='sd_kpt3', + id=277, + color=colors['sd'], + type='', + swap='sd_kpt5'), + 278: + dict(name='sd_kpt4', id=278, color=colors['sd'], type='', swap=''), + 279: + dict( + name='sd_kpt5', + id=279, + color=colors['sd'], + type='', + swap='sd_kpt3'), + 280: + dict( + name='sd_kpt6', + id=280, + color=colors['sd'], + type='', + swap='sd_kpt2'), + 281: + dict( + name='sd_kpt7', + id=281, + color=colors['sd'], + type='', + swap='sd_kpt19'), + 282: + dict( + name='sd_kpt8', + id=282, + color=colors['sd'], + type='', + swap='sd_kpt18'), + 283: + dict( + name='sd_kpt9', + id=283, + color=colors['sd'], + type='', + swap='sd_kpt17'), + 284: + dict( + name='sd_kpt10', + id=284, + color=colors['sd'], + type='', + swap='sd_kpt16'), + 285: + dict( + name='sd_kpt11', + id=285, + color=colors['sd'], + type='', + swap='sd_kpt15'), + 286: + dict( + name='sd_kpt12', + id=286, + color=colors['sd'], + type='', + swap='sd_kpt14'), + 287: + dict(name='sd_kpt13', id=287, color=colors['sd'], type='', swap=''), + 288: + dict( + name='sd_kpt14', + id=288, + color=colors['sd'], + type='', + swap='sd_kpt12'), + 289: + dict( + name='sd_kpt15', + id=289, + color=colors['sd'], + type='', + swap='sd_kpt11'), + 290: + dict( + name='sd_kpt16', + id=290, + color=colors['sd'], + type='', + swap='sd_kpt10'), + 291: + dict( + name='sd_kpt17', + id=291, + color=colors['sd'], 
+ type='', + swap='sd_kpt9'), + 292: + dict( + name='sd_kpt18', + id=292, + color=colors['sd'], + type='', + swap='sd_kpt8'), + 293: + dict( + name='sd_kpt19', + id=293, + color=colors['sd'], + type='', + swap='sd_kpt7'), + }, + skeleton_info={ + # short_sleeved_shirt + 0: + dict(link=('sss_kpt1', 'sss_kpt2'), id=0, color=[255, 128, 0]), + 1: + dict(link=('sss_kpt2', 'sss_kpt7'), id=1, color=[255, 128, 0]), + 2: + dict(link=('sss_kpt7', 'sss_kpt8'), id=2, color=[255, 128, 0]), + 3: + dict(link=('sss_kpt8', 'sss_kpt9'), id=3, color=[255, 128, 0]), + 4: + dict(link=('sss_kpt9', 'sss_kpt10'), id=4, color=[255, 128, 0]), + 5: + dict(link=('sss_kpt10', 'sss_kpt11'), id=5, color=[255, 128, 0]), + 6: + dict(link=('sss_kpt11', 'sss_kpt12'), id=6, color=[255, 128, 0]), + 7: + dict(link=('sss_kpt12', 'sss_kpt13'), id=7, color=[255, 128, 0]), + 8: + dict(link=('sss_kpt13', 'sss_kpt14'), id=8, color=[255, 128, 0]), + 9: + dict(link=('sss_kpt14', 'sss_kpt15'), id=9, color=[255, 128, 0]), + 10: + dict(link=('sss_kpt15', 'sss_kpt16'), id=10, color=[255, 128, 0]), + 11: + dict(link=('sss_kpt16', 'sss_kpt17'), id=11, color=[255, 128, 0]), + 12: + dict(link=('sss_kpt17', 'sss_kpt18'), id=12, color=[255, 128, 0]), + 13: + dict(link=('sss_kpt18', 'sss_kpt19'), id=13, color=[255, 128, 0]), + 14: + dict(link=('sss_kpt19', 'sss_kpt20'), id=14, color=[255, 128, 0]), + 15: + dict(link=('sss_kpt20', 'sss_kpt21'), id=15, color=[255, 128, 0]), + 16: + dict(link=('sss_kpt21', 'sss_kpt22'), id=16, color=[255, 128, 0]), + 17: + dict(link=('sss_kpt22', 'sss_kpt23'), id=17, color=[255, 128, 0]), + 18: + dict(link=('sss_kpt23', 'sss_kpt24'), id=18, color=[255, 128, 0]), + 19: + dict(link=('sss_kpt24', 'sss_kpt25'), id=19, color=[255, 128, 0]), + 20: + dict(link=('sss_kpt25', 'sss_kpt6'), id=20, color=[255, 128, 0]), + 21: + dict(link=('sss_kpt6', 'sss_kpt1'), id=21, color=[255, 128, 0]), + 22: + dict(link=('sss_kpt2', 'sss_kpt3'), id=22, color=[255, 128, 0]), + 23: + dict(link=('sss_kpt3', 
'sss_kpt4'), id=23, color=[255, 128, 0]), + 24: + dict(link=('sss_kpt4', 'sss_kpt5'), id=24, color=[255, 128, 0]), + 25: + dict(link=('sss_kpt5', 'sss_kpt6'), id=25, color=[255, 128, 0]), + # long_sleeve_shirt + 26: + dict(link=('lss_kpt1', 'lss_kpt2'), id=26, color=[255, 0, 128]), + 27: + dict(link=('lss_kpt2', 'lss_kpt7'), id=27, color=[255, 0, 128]), + 28: + dict(link=('lss_kpt7', 'lss_kpt8'), id=28, color=[255, 0, 128]), + 29: + dict(link=('lss_kpt8', 'lss_kpt9'), id=29, color=[255, 0, 128]), + 30: + dict(link=('lss_kpt9', 'lss_kpt10'), id=30, color=[255, 0, 128]), + 31: + dict(link=('lss_kpt10', 'lss_kpt11'), id=31, color=[255, 0, 128]), + 32: + dict(link=('lss_kpt11', 'lss_kpt12'), id=32, color=[255, 0, 128]), + 33: + dict(link=('lss_kpt12', 'lss_kpt13'), id=33, color=[255, 0, 128]), + 34: + dict(link=('lss_kpt13', 'lss_kpt14'), id=34, color=[255, 0, 128]), + 35: + dict(link=('lss_kpt14', 'lss_kpt15'), id=35, color=[255, 0, 128]), + 36: + dict(link=('lss_kpt15', 'lss_kpt16'), id=36, color=[255, 0, 128]), + 37: + dict(link=('lss_kpt16', 'lss_kpt17'), id=37, color=[255, 0, 128]), + 38: + dict(link=('lss_kpt17', 'lss_kpt18'), id=38, color=[255, 0, 128]), + 39: + dict(link=('lss_kpt18', 'lss_kpt19'), id=39, color=[255, 0, 128]), + 40: + dict(link=('lss_kpt19', 'lss_kpt20'), id=40, color=[255, 0, 128]), + 41: + dict(link=('lss_kpt20', 'lss_kpt21'), id=41, color=[255, 0, 128]), + 42: + dict(link=('lss_kpt21', 'lss_kpt22'), id=42, color=[255, 0, 128]), + 43: + dict(link=('lss_kpt22', 'lss_kpt23'), id=43, color=[255, 0, 128]), + 44: + dict(link=('lss_kpt23', 'lss_kpt24'), id=44, color=[255, 0, 128]), + 45: + dict(link=('lss_kpt24', 'lss_kpt25'), id=45, color=[255, 0, 128]), + 46: + dict(link=('lss_kpt25', 'lss_kpt26'), id=46, color=[255, 0, 128]), + 47: + dict(link=('lss_kpt26', 'lss_kpt27'), id=47, color=[255, 0, 128]), + 48: + dict(link=('lss_kpt27', 'lss_kpt28'), id=48, color=[255, 0, 128]), + 49: + dict(link=('lss_kpt28', 'lss_kpt29'), id=49, color=[255, 0, 
128]), + 50: + dict(link=('lss_kpt29', 'lss_kpt30'), id=50, color=[255, 0, 128]), + 51: + dict(link=('lss_kpt30', 'lss_kpt31'), id=51, color=[255, 0, 128]), + 52: + dict(link=('lss_kpt31', 'lss_kpt32'), id=52, color=[255, 0, 128]), + 53: + dict(link=('lss_kpt32', 'lss_kpt33'), id=53, color=[255, 0, 128]), + 54: + dict(link=('lss_kpt33', 'lss_kpt6'), id=54, color=[255, 0, 128]), + 55: + dict(link=('lss_kpt6', 'lss_kpt5'), id=55, color=[255, 0, 128]), + 56: + dict(link=('lss_kpt5', 'lss_kpt4'), id=56, color=[255, 0, 128]), + 57: + dict(link=('lss_kpt4', 'lss_kpt3'), id=57, color=[255, 0, 128]), + 58: + dict(link=('lss_kpt3', 'lss_kpt2'), id=58, color=[255, 0, 128]), + 59: + dict(link=('lss_kpt6', 'lss_kpt1'), id=59, color=[255, 0, 128]), + # short_sleeved_outwear + 60: + dict(link=('sso_kpt1', 'sso_kpt4'), id=60, color=[128, 0, 255]), + 61: + dict(link=('sso_kpt4', 'sso_kpt7'), id=61, color=[128, 0, 255]), + 62: + dict(link=('sso_kpt7', 'sso_kpt8'), id=62, color=[128, 0, 255]), + 63: + dict(link=('sso_kpt8', 'sso_kpt9'), id=63, color=[128, 0, 255]), + 64: + dict(link=('sso_kpt9', 'sso_kpt10'), id=64, color=[128, 0, 255]), + 65: + dict(link=('sso_kpt10', 'sso_kpt11'), id=65, color=[128, 0, 255]), + 66: + dict(link=('sso_kpt11', 'sso_kpt12'), id=66, color=[128, 0, 255]), + 67: + dict(link=('sso_kpt12', 'sso_kpt13'), id=67, color=[128, 0, 255]), + 68: + dict(link=('sso_kpt13', 'sso_kpt14'), id=68, color=[128, 0, 255]), + 69: + dict(link=('sso_kpt14', 'sso_kpt15'), id=69, color=[128, 0, 255]), + 70: + dict(link=('sso_kpt15', 'sso_kpt16'), id=70, color=[128, 0, 255]), + 71: + dict(link=('sso_kpt16', 'sso_kpt31'), id=71, color=[128, 0, 255]), + 72: + dict(link=('sso_kpt31', 'sso_kpt30'), id=72, color=[128, 0, 255]), + 73: + dict(link=('sso_kpt30', 'sso_kpt2'), id=73, color=[128, 0, 255]), + 74: + dict(link=('sso_kpt2', 'sso_kpt3'), id=74, color=[128, 0, 255]), + 75: + dict(link=('sso_kpt3', 'sso_kpt4'), id=75, color=[128, 0, 255]), + 76: + dict(link=('sso_kpt1', 
'sso_kpt6'), id=76, color=[128, 0, 255]), + 77: + dict(link=('sso_kpt6', 'sso_kpt25'), id=77, color=[128, 0, 255]), + 78: + dict(link=('sso_kpt25', 'sso_kpt24'), id=78, color=[128, 0, 255]), + 79: + dict(link=('sso_kpt24', 'sso_kpt23'), id=79, color=[128, 0, 255]), + 80: + dict(link=('sso_kpt23', 'sso_kpt22'), id=80, color=[128, 0, 255]), + 81: + dict(link=('sso_kpt22', 'sso_kpt21'), id=81, color=[128, 0, 255]), + 82: + dict(link=('sso_kpt21', 'sso_kpt20'), id=82, color=[128, 0, 255]), + 83: + dict(link=('sso_kpt20', 'sso_kpt19'), id=83, color=[128, 0, 255]), + 84: + dict(link=('sso_kpt19', 'sso_kpt18'), id=84, color=[128, 0, 255]), + 85: + dict(link=('sso_kpt18', 'sso_kpt17'), id=85, color=[128, 0, 255]), + 86: + dict(link=('sso_kpt17', 'sso_kpt29'), id=86, color=[128, 0, 255]), + 87: + dict(link=('sso_kpt29', 'sso_kpt28'), id=87, color=[128, 0, 255]), + 88: + dict(link=('sso_kpt28', 'sso_kpt27'), id=88, color=[128, 0, 255]), + 89: + dict(link=('sso_kpt27', 'sso_kpt26'), id=89, color=[128, 0, 255]), + 90: + dict(link=('sso_kpt26', 'sso_kpt5'), id=90, color=[128, 0, 255]), + 91: + dict(link=('sso_kpt5', 'sso_kpt6'), id=91, color=[128, 0, 255]), + # long_sleeved_outwear + 92: + dict(link=('lso_kpt1', 'lso_kpt2'), id=92, color=[0, 128, 255]), + 93: + dict(link=('lso_kpt2', 'lso_kpt7'), id=93, color=[0, 128, 255]), + 94: + dict(link=('lso_kpt7', 'lso_kpt8'), id=94, color=[0, 128, 255]), + 95: + dict(link=('lso_kpt8', 'lso_kpt9'), id=95, color=[0, 128, 255]), + 96: + dict(link=('lso_kpt9', 'lso_kpt10'), id=96, color=[0, 128, 255]), + 97: + dict(link=('lso_kpt10', 'lso_kpt11'), id=97, color=[0, 128, 255]), + 98: + dict(link=('lso_kpt11', 'lso_kpt12'), id=98, color=[0, 128, 255]), + 99: + dict(link=('lso_kpt12', 'lso_kpt13'), id=99, color=[0, 128, 255]), + 100: + dict(link=('lso_kpt13', 'lso_kpt14'), id=100, color=[0, 128, 255]), + 101: + dict(link=('lso_kpt14', 'lso_kpt15'), id=101, color=[0, 128, 255]), + 102: + dict(link=('lso_kpt15', 'lso_kpt16'), id=102, color=[0, 
128, 255]), + 103: + dict(link=('lso_kpt16', 'lso_kpt17'), id=103, color=[0, 128, 255]), + 104: + dict(link=('lso_kpt17', 'lso_kpt18'), id=104, color=[0, 128, 255]), + 105: + dict(link=('lso_kpt18', 'lso_kpt19'), id=105, color=[0, 128, 255]), + 106: + dict(link=('lso_kpt19', 'lso_kpt20'), id=106, color=[0, 128, 255]), + 107: + dict(link=('lso_kpt20', 'lso_kpt39'), id=107, color=[0, 128, 255]), + 108: + dict(link=('lso_kpt39', 'lso_kpt38'), id=108, color=[0, 128, 255]), + 109: + dict(link=('lso_kpt38', 'lso_kpt4'), id=109, color=[0, 128, 255]), + 110: + dict(link=('lso_kpt4', 'lso_kpt3'), id=110, color=[0, 128, 255]), + 111: + dict(link=('lso_kpt3', 'lso_kpt2'), id=111, color=[0, 128, 255]), + 112: + dict(link=('lso_kpt1', 'lso_kpt6'), id=112, color=[0, 128, 255]), + 113: + dict(link=('lso_kpt6', 'lso_kpt33'), id=113, color=[0, 128, 255]), + 114: + dict(link=('lso_kpt33', 'lso_kpt32'), id=114, color=[0, 128, 255]), + 115: + dict(link=('lso_kpt32', 'lso_kpt31'), id=115, color=[0, 128, 255]), + 116: + dict(link=('lso_kpt31', 'lso_kpt30'), id=116, color=[0, 128, 255]), + 117: + dict(link=('lso_kpt30', 'lso_kpt29'), id=117, color=[0, 128, 255]), + 118: + dict(link=('lso_kpt29', 'lso_kpt28'), id=118, color=[0, 128, 255]), + 119: + dict(link=('lso_kpt28', 'lso_kpt27'), id=119, color=[0, 128, 255]), + 120: + dict(link=('lso_kpt27', 'lso_kpt26'), id=120, color=[0, 128, 255]), + 121: + dict(link=('lso_kpt26', 'lso_kpt25'), id=121, color=[0, 128, 255]), + 122: + dict(link=('lso_kpt25', 'lso_kpt24'), id=122, color=[0, 128, 255]), + 123: + dict(link=('lso_kpt24', 'lso_kpt23'), id=123, color=[0, 128, 255]), + 124: + dict(link=('lso_kpt23', 'lso_kpt22'), id=124, color=[0, 128, 255]), + 125: + dict(link=('lso_kpt22', 'lso_kpt21'), id=125, color=[0, 128, 255]), + 126: + dict(link=('lso_kpt21', 'lso_kpt37'), id=126, color=[0, 128, 255]), + 127: + dict(link=('lso_kpt37', 'lso_kpt36'), id=127, color=[0, 128, 255]), + 128: + dict(link=('lso_kpt36', 'lso_kpt35'), id=128, color=[0, 128, 
255]), + 129: + dict(link=('lso_kpt35', 'lso_kpt34'), id=129, color=[0, 128, 255]), + 130: + dict(link=('lso_kpt34', 'lso_kpt5'), id=130, color=[0, 128, 255]), + 131: + dict(link=('lso_kpt5', 'lso_kpt6'), id=131, color=[0, 128, 255]), + # vest + 132: + dict(link=('vest_kpt1', 'vest_kpt2'), id=132, color=[0, 128, 128]), + 133: + dict(link=('vest_kpt2', 'vest_kpt7'), id=133, color=[0, 128, 128]), + 134: + dict(link=('vest_kpt7', 'vest_kpt8'), id=134, color=[0, 128, 128]), + 135: + dict(link=('vest_kpt8', 'vest_kpt9'), id=135, color=[0, 128, 128]), + 136: + dict(link=('vest_kpt9', 'vest_kpt10'), id=136, color=[0, 128, 128]), + 137: + dict(link=('vest_kpt10', 'vest_kpt11'), id=137, color=[0, 128, 128]), + 138: + dict(link=('vest_kpt11', 'vest_kpt12'), id=138, color=[0, 128, 128]), + 139: + dict(link=('vest_kpt12', 'vest_kpt13'), id=139, color=[0, 128, 128]), + 140: + dict(link=('vest_kpt13', 'vest_kpt14'), id=140, color=[0, 128, 128]), + 141: + dict(link=('vest_kpt14', 'vest_kpt15'), id=141, color=[0, 128, 128]), + 142: + dict(link=('vest_kpt15', 'vest_kpt6'), id=142, color=[0, 128, 128]), + 143: + dict(link=('vest_kpt6', 'vest_kpt1'), id=143, color=[0, 128, 128]), + 144: + dict(link=('vest_kpt2', 'vest_kpt3'), id=144, color=[0, 128, 128]), + 145: + dict(link=('vest_kpt3', 'vest_kpt4'), id=145, color=[0, 128, 128]), + 146: + dict(link=('vest_kpt4', 'vest_kpt5'), id=146, color=[0, 128, 128]), + 147: + dict(link=('vest_kpt5', 'vest_kpt6'), id=147, color=[0, 128, 128]), + # sling + 148: + dict(link=('sling_kpt1', 'sling_kpt2'), id=148, color=[0, 0, 128]), + 149: + dict(link=('sling_kpt2', 'sling_kpt8'), id=149, color=[0, 0, 128]), + 150: + dict(link=('sling_kpt8', 'sling_kpt9'), id=150, color=[0, 0, 128]), + 151: + dict(link=('sling_kpt9', 'sling_kpt10'), id=151, color=[0, 0, 128]), + 152: + dict(link=('sling_kpt10', 'sling_kpt11'), id=152, color=[0, 0, 128]), + 153: + dict(link=('sling_kpt11', 'sling_kpt12'), id=153, color=[0, 0, 128]), + 154: + dict(link=('sling_kpt12', 
'sling_kpt13'), id=154, color=[0, 0, 128]), + 155: + dict(link=('sling_kpt13', 'sling_kpt14'), id=155, color=[0, 0, 128]), + 156: + dict(link=('sling_kpt14', 'sling_kpt6'), id=156, color=[0, 0, 128]), + 157: + dict(link=('sling_kpt2', 'sling_kpt7'), id=157, color=[0, 0, 128]), + 158: + dict(link=('sling_kpt6', 'sling_kpt15'), id=158, color=[0, 0, 128]), + 159: + dict(link=('sling_kpt2', 'sling_kpt3'), id=159, color=[0, 0, 128]), + 160: + dict(link=('sling_kpt3', 'sling_kpt4'), id=160, color=[0, 0, 128]), + 161: + dict(link=('sling_kpt4', 'sling_kpt5'), id=161, color=[0, 0, 128]), + 162: + dict(link=('sling_kpt5', 'sling_kpt6'), id=162, color=[0, 0, 128]), + 163: + dict(link=('sling_kpt1', 'sling_kpt6'), id=163, color=[0, 0, 128]), + # shorts + 164: + dict( + link=('shorts_kpt1', 'shorts_kpt4'), id=164, color=[128, 128, + 128]), + 165: + dict( + link=('shorts_kpt4', 'shorts_kpt5'), id=165, color=[128, 128, + 128]), + 166: + dict( + link=('shorts_kpt5', 'shorts_kpt6'), id=166, color=[128, 128, + 128]), + 167: + dict( + link=('shorts_kpt6', 'shorts_kpt7'), id=167, color=[128, 128, + 128]), + 168: + dict( + link=('shorts_kpt7', 'shorts_kpt8'), id=168, color=[128, 128, + 128]), + 169: + dict( + link=('shorts_kpt8', 'shorts_kpt9'), id=169, color=[128, 128, + 128]), + 170: + dict( + link=('shorts_kpt9', 'shorts_kpt10'), + id=170, + color=[128, 128, 128]), + 171: + dict( + link=('shorts_kpt10', 'shorts_kpt3'), + id=171, + color=[128, 128, 128]), + 172: + dict( + link=('shorts_kpt3', 'shorts_kpt2'), id=172, color=[128, 128, + 128]), + 173: + dict( + link=('shorts_kpt2', 'shorts_kpt1'), id=173, color=[128, 128, + 128]), + # trousers + 174: + dict( + link=('trousers_kpt1', 'trousers_kpt4'), + id=174, + color=[128, 0, 128]), + 175: + dict( + link=('trousers_kpt4', 'trousers_kpt5'), + id=175, + color=[128, 0, 128]), + 176: + dict( + link=('trousers_kpt5', 'trousers_kpt6'), + id=176, + color=[128, 0, 128]), + 177: + dict( + link=('trousers_kpt6', 'trousers_kpt7'), + id=177, + 
color=[128, 0, 128]), + 178: + dict( + link=('trousers_kpt7', 'trousers_kpt8'), + id=178, + color=[128, 0, 128]), + 179: + dict( + link=('trousers_kpt8', 'trousers_kpt9'), + id=179, + color=[128, 0, 128]), + 180: + dict( + link=('trousers_kpt9', 'trousers_kpt10'), + id=180, + color=[128, 0, 128]), + 181: + dict( + link=('trousers_kpt10', 'trousers_kpt11'), + id=181, + color=[128, 0, 128]), + 182: + dict( + link=('trousers_kpt11', 'trousers_kpt12'), + id=182, + color=[128, 0, 128]), + 183: + dict( + link=('trousers_kpt12', 'trousers_kpt13'), + id=183, + color=[128, 0, 128]), + 184: + dict( + link=('trousers_kpt13', 'trousers_kpt14'), + id=184, + color=[128, 0, 128]), + 185: + dict( + link=('trousers_kpt14', 'trousers_kpt3'), + id=185, + color=[128, 0, 128]), + 186: + dict( + link=('trousers_kpt3', 'trousers_kpt2'), + id=186, + color=[128, 0, 128]), + 187: + dict( + link=('trousers_kpt2', 'trousers_kpt1'), + id=187, + color=[128, 0, 128]), + # skirt + 188: + dict(link=('skirt_kpt1', 'skirt_kpt4'), id=188, color=[64, 128, 128]), + 189: + dict(link=('skirt_kpt4', 'skirt_kpt5'), id=189, color=[64, 128, 128]), + 190: + dict(link=('skirt_kpt5', 'skirt_kpt6'), id=190, color=[64, 128, 128]), + 191: + dict(link=('skirt_kpt6', 'skirt_kpt7'), id=191, color=[64, 128, 128]), + 192: + dict(link=('skirt_kpt7', 'skirt_kpt8'), id=192, color=[64, 128, 128]), + 193: + dict(link=('skirt_kpt8', 'skirt_kpt3'), id=193, color=[64, 128, 128]), + 194: + dict(link=('skirt_kpt3', 'skirt_kpt2'), id=194, color=[64, 128, 128]), + 195: + dict(link=('skirt_kpt2', 'skirt_kpt1'), id=195, color=[64, 128, 128]), + # short_sleeved_dress + 196: + dict(link=('ssd_kpt1', 'ssd_kpt2'), id=196, color=[64, 64, 128]), + 197: + dict(link=('ssd_kpt2', 'ssd_kpt7'), id=197, color=[64, 64, 128]), + 198: + dict(link=('ssd_kpt7', 'ssd_kpt8'), id=198, color=[64, 64, 128]), + 199: + dict(link=('ssd_kpt8', 'ssd_kpt9'), id=199, color=[64, 64, 128]), + 200: + dict(link=('ssd_kpt9', 'ssd_kpt10'), id=200, color=[64, 64, 
128]), + 201: + dict(link=('ssd_kpt10', 'ssd_kpt11'), id=201, color=[64, 64, 128]), + 202: + dict(link=('ssd_kpt11', 'ssd_kpt12'), id=202, color=[64, 64, 128]), + 203: + dict(link=('ssd_kpt12', 'ssd_kpt13'), id=203, color=[64, 64, 128]), + 204: + dict(link=('ssd_kpt13', 'ssd_kpt14'), id=204, color=[64, 64, 128]), + 205: + dict(link=('ssd_kpt14', 'ssd_kpt15'), id=205, color=[64, 64, 128]), + 206: + dict(link=('ssd_kpt15', 'ssd_kpt16'), id=206, color=[64, 64, 128]), + 207: + dict(link=('ssd_kpt16', 'ssd_kpt17'), id=207, color=[64, 64, 128]), + 208: + dict(link=('ssd_kpt17', 'ssd_kpt18'), id=208, color=[64, 64, 128]), + 209: + dict(link=('ssd_kpt18', 'ssd_kpt19'), id=209, color=[64, 64, 128]), + 210: + dict(link=('ssd_kpt19', 'ssd_kpt20'), id=210, color=[64, 64, 128]), + 211: + dict(link=('ssd_kpt20', 'ssd_kpt21'), id=211, color=[64, 64, 128]), + 212: + dict(link=('ssd_kpt21', 'ssd_kpt22'), id=212, color=[64, 64, 128]), + 213: + dict(link=('ssd_kpt22', 'ssd_kpt23'), id=213, color=[64, 64, 128]), + 214: + dict(link=('ssd_kpt23', 'ssd_kpt24'), id=214, color=[64, 64, 128]), + 215: + dict(link=('ssd_kpt24', 'ssd_kpt25'), id=215, color=[64, 64, 128]), + 216: + dict(link=('ssd_kpt25', 'ssd_kpt26'), id=216, color=[64, 64, 128]), + 217: + dict(link=('ssd_kpt26', 'ssd_kpt27'), id=217, color=[64, 64, 128]), + 218: + dict(link=('ssd_kpt27', 'ssd_kpt28'), id=218, color=[64, 64, 128]), + 219: + dict(link=('ssd_kpt28', 'ssd_kpt29'), id=219, color=[64, 64, 128]), + 220: + dict(link=('ssd_kpt29', 'ssd_kpt6'), id=220, color=[64, 64, 128]), + 221: + dict(link=('ssd_kpt6', 'ssd_kpt5'), id=221, color=[64, 64, 128]), + 222: + dict(link=('ssd_kpt5', 'ssd_kpt4'), id=222, color=[64, 64, 128]), + 223: + dict(link=('ssd_kpt4', 'ssd_kpt3'), id=223, color=[64, 64, 128]), + 224: + dict(link=('ssd_kpt3', 'ssd_kpt2'), id=224, color=[64, 64, 128]), + 225: + dict(link=('ssd_kpt6', 'ssd_kpt1'), id=225, color=[64, 64, 128]), + # long_sleeved_dress + 226: + dict(link=('lsd_kpt1', 'lsd_kpt2'), id=226, 
color=[128, 64, 0]), + 227: + dict(link=('lsd_kpt2', 'lsd_kpt7'), id=227, color=[128, 64, 0]), + 228: + dict(link=('lsd_kpt7', 'lsd_kpt8'), id=228, color=[128, 64, 0]), + 229: + dict(link=('lsd_kpt8', 'lsd_kpt9'), id=229, color=[128, 64, 0]), + 230: + dict(link=('lsd_kpt9', 'lsd_kpt10'), id=230, color=[128, 64, 0]), + 231: + dict(link=('lsd_kpt10', 'lsd_kpt11'), id=231, color=[128, 64, 0]), + 232: + dict(link=('lsd_kpt11', 'lsd_kpt12'), id=232, color=[128, 64, 0]), + 233: + dict(link=('lsd_kpt12', 'lsd_kpt13'), id=233, color=[128, 64, 0]), + 234: + dict(link=('lsd_kpt13', 'lsd_kpt14'), id=234, color=[128, 64, 0]), + 235: + dict(link=('lsd_kpt14', 'lsd_kpt15'), id=235, color=[128, 64, 0]), + 236: + dict(link=('lsd_kpt15', 'lsd_kpt16'), id=236, color=[128, 64, 0]), + 237: + dict(link=('lsd_kpt16', 'lsd_kpt17'), id=237, color=[128, 64, 0]), + 238: + dict(link=('lsd_kpt17', 'lsd_kpt18'), id=238, color=[128, 64, 0]), + 239: + dict(link=('lsd_kpt18', 'lsd_kpt19'), id=239, color=[128, 64, 0]), + 240: + dict(link=('lsd_kpt19', 'lsd_kpt20'), id=240, color=[128, 64, 0]), + 241: + dict(link=('lsd_kpt20', 'lsd_kpt21'), id=241, color=[128, 64, 0]), + 242: + dict(link=('lsd_kpt21', 'lsd_kpt22'), id=242, color=[128, 64, 0]), + 243: + dict(link=('lsd_kpt22', 'lsd_kpt23'), id=243, color=[128, 64, 0]), + 244: + dict(link=('lsd_kpt23', 'lsd_kpt24'), id=244, color=[128, 64, 0]), + 245: + dict(link=('lsd_kpt24', 'lsd_kpt25'), id=245, color=[128, 64, 0]), + 246: + dict(link=('lsd_kpt25', 'lsd_kpt26'), id=246, color=[128, 64, 0]), + 247: + dict(link=('lsd_kpt26', 'lsd_kpt27'), id=247, color=[128, 64, 0]), + 248: + dict(link=('lsd_kpt27', 'lsd_kpt28'), id=248, color=[128, 64, 0]), + 249: + dict(link=('lsd_kpt28', 'lsd_kpt29'), id=249, color=[128, 64, 0]), + 250: + dict(link=('lsd_kpt29', 'lsd_kpt30'), id=250, color=[128, 64, 0]), + 251: + dict(link=('lsd_kpt30', 'lsd_kpt31'), id=251, color=[128, 64, 0]), + 252: + dict(link=('lsd_kpt31', 'lsd_kpt32'), id=252, color=[128, 64, 0]), + 253: + 
dict(link=('lsd_kpt32', 'lsd_kpt33'), id=253, color=[128, 64, 0]), + 254: + dict(link=('lsd_kpt33', 'lsd_kpt34'), id=254, color=[128, 64, 0]), + 255: + dict(link=('lsd_kpt34', 'lsd_kpt35'), id=255, color=[128, 64, 0]), + 256: + dict(link=('lsd_kpt35', 'lsd_kpt36'), id=256, color=[128, 64, 0]), + 257: + dict(link=('lsd_kpt36', 'lsd_kpt37'), id=257, color=[128, 64, 0]), + 258: + dict(link=('lsd_kpt37', 'lsd_kpt6'), id=258, color=[128, 64, 0]), + 259: + dict(link=('lsd_kpt6', 'lsd_kpt5'), id=259, color=[128, 64, 0]), + 260: + dict(link=('lsd_kpt5', 'lsd_kpt4'), id=260, color=[128, 64, 0]), + 261: + dict(link=('lsd_kpt4', 'lsd_kpt3'), id=261, color=[128, 64, 0]), + 262: + dict(link=('lsd_kpt3', 'lsd_kpt2'), id=262, color=[128, 64, 0]), + 263: + dict(link=('lsd_kpt6', 'lsd_kpt1'), id=263, color=[128, 64, 0]), + # vest_dress + 264: + dict(link=('vd_kpt1', 'vd_kpt2'), id=264, color=[128, 64, 255]), + 265: + dict(link=('vd_kpt2', 'vd_kpt7'), id=265, color=[128, 64, 255]), + 266: + dict(link=('vd_kpt7', 'vd_kpt8'), id=266, color=[128, 64, 255]), + 267: + dict(link=('vd_kpt8', 'vd_kpt9'), id=267, color=[128, 64, 255]), + 268: + dict(link=('vd_kpt9', 'vd_kpt10'), id=268, color=[128, 64, 255]), + 269: + dict(link=('vd_kpt10', 'vd_kpt11'), id=269, color=[128, 64, 255]), + 270: + dict(link=('vd_kpt11', 'vd_kpt12'), id=270, color=[128, 64, 255]), + 271: + dict(link=('vd_kpt12', 'vd_kpt13'), id=271, color=[128, 64, 255]), + 272: + dict(link=('vd_kpt13', 'vd_kpt14'), id=272, color=[128, 64, 255]), + 273: + dict(link=('vd_kpt14', 'vd_kpt15'), id=273, color=[128, 64, 255]), + 274: + dict(link=('vd_kpt15', 'vd_kpt16'), id=274, color=[128, 64, 255]), + 275: + dict(link=('vd_kpt16', 'vd_kpt17'), id=275, color=[128, 64, 255]), + 276: + dict(link=('vd_kpt17', 'vd_kpt18'), id=276, color=[128, 64, 255]), + 277: + dict(link=('vd_kpt18', 'vd_kpt19'), id=277, color=[128, 64, 255]), + 278: + dict(link=('vd_kpt19', 'vd_kpt6'), id=278, color=[128, 64, 255]), + 279: + dict(link=('vd_kpt6', 
'vd_kpt5'), id=279, color=[128, 64, 255]), + 280: + dict(link=('vd_kpt5', 'vd_kpt4'), id=280, color=[128, 64, 255]), + 281: + dict(link=('vd_kpt4', 'vd_kpt3'), id=281, color=[128, 64, 255]), + 282: + dict(link=('vd_kpt3', 'vd_kpt2'), id=282, color=[128, 64, 255]), + 283: + dict(link=('vd_kpt6', 'vd_kpt1'), id=283, color=[128, 64, 255]), + # sling_dress + 284: + dict(link=('sd_kpt1', 'sd_kpt2'), id=284, color=[128, 64, 0]), + 285: + dict(link=('sd_kpt2', 'sd_kpt8'), id=285, color=[128, 64, 0]), + 286: + dict(link=('sd_kpt8', 'sd_kpt9'), id=286, color=[128, 64, 0]), + 287: + dict(link=('sd_kpt9', 'sd_kpt10'), id=287, color=[128, 64, 0]), + 288: + dict(link=('sd_kpt10', 'sd_kpt11'), id=288, color=[128, 64, 0]), + 289: + dict(link=('sd_kpt11', 'sd_kpt12'), id=289, color=[128, 64, 0]), + 290: + dict(link=('sd_kpt12', 'sd_kpt13'), id=290, color=[128, 64, 0]), + 291: + dict(link=('sd_kpt13', 'sd_kpt14'), id=291, color=[128, 64, 0]), + 292: + dict(link=('sd_kpt14', 'sd_kpt15'), id=292, color=[128, 64, 0]), + 293: + dict(link=('sd_kpt15', 'sd_kpt16'), id=293, color=[128, 64, 0]), + 294: + dict(link=('sd_kpt16', 'sd_kpt17'), id=294, color=[128, 64, 0]), + 295: + dict(link=('sd_kpt17', 'sd_kpt18'), id=295, color=[128, 64, 0]), + 296: + dict(link=('sd_kpt18', 'sd_kpt6'), id=296, color=[128, 64, 0]), + 297: + dict(link=('sd_kpt6', 'sd_kpt5'), id=297, color=[128, 64, 0]), + 298: + dict(link=('sd_kpt5', 'sd_kpt4'), id=298, color=[128, 64, 0]), + 299: + dict(link=('sd_kpt4', 'sd_kpt3'), id=299, color=[128, 64, 0]), + 300: + dict(link=('sd_kpt3', 'sd_kpt2'), id=300, color=[128, 64, 0]), + 301: + dict(link=('sd_kpt2', 'sd_kpt7'), id=301, color=[128, 64, 0]), + 302: + dict(link=('sd_kpt6', 'sd_kpt19'), id=302, color=[128, 64, 0]), + 303: + dict(link=('sd_kpt6', 'sd_kpt1'), id=303, color=[128, 64, 0]), + }, + joint_weights=[1.] 
* 294, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/deepfashion_full.py b/modules/rtmpose/configs/_base_/datasets/deepfashion_full.py new file mode 100644 index 0000000..9769127 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/deepfashion_full.py @@ -0,0 +1,74 @@ +dataset_info = dict( + dataset_name='deepfashion_full', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left collar', + id=0, + color=[255, 255, 255], + type='', + swap='right collar'), + 1: + dict( + name='right collar', + id=1, + color=[255, 255, 255], + type='', + swap='left collar'), + 2: + dict( + name='left sleeve', + id=2, + color=[255, 255, 255], + type='', + swap='right sleeve'), + 3: + dict( + name='right sleeve', + id=3, + color=[255, 255, 255], + type='', + swap='left sleeve'), + 4: + dict( + name='left waistline', + id=4, + color=[255, 255, 255], + type='', + swap='right waistline'), + 5: + dict( + name='right waistline', + id=5, + color=[255, 255, 255], + type='', + swap='left waistline'), + 6: + dict( + name='left hem', + id=6, + color=[255, 255, 255], + type='', + swap='right hem'), + 7: + dict( + name='right hem', + id=7, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] 
* 8, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/deepfashion_lower.py b/modules/rtmpose/configs/_base_/datasets/deepfashion_lower.py new file mode 100644 index 0000000..65995e1 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/deepfashion_lower.py @@ -0,0 +1,46 @@ +dataset_info = dict( + dataset_name='deepfashion_lower', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left waistline', + id=0, + color=[255, 255, 255], + type='', + swap='right waistline'), + 1: + dict( + name='right waistline', + id=1, + color=[255, 255, 255], + type='', + swap='left waistline'), + 2: + dict( + name='left hem', + id=2, + color=[255, 255, 255], + type='', + swap='right hem'), + 3: + dict( + name='right hem', + id=3, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] 
* 4, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/deepfashion_upper.py b/modules/rtmpose/configs/_base_/datasets/deepfashion_upper.py new file mode 100644 index 0000000..4f34e2a --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/deepfashion_upper.py @@ -0,0 +1,60 @@ +dataset_info = dict( + dataset_name='deepfashion_upper', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left collar', + id=0, + color=[255, 255, 255], + type='', + swap='right collar'), + 1: + dict( + name='right collar', + id=1, + color=[255, 255, 255], + type='', + swap='left collar'), + 2: + dict( + name='left sleeve', + id=2, + color=[255, 255, 255], + type='', + swap='right sleeve'), + 3: + dict( + name='right sleeve', + id=3, + color=[255, 255, 255], + type='', + swap='left sleeve'), + 4: + dict( + name='left hem', + id=4, + color=[255, 255, 255], + type='', + swap='right hem'), + 5: + dict( + name='right hem', + id=5, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] 
* 6, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/exlpose.py b/modules/rtmpose/configs/_base_/datasets/exlpose.py new file mode 100644 index 0000000..0a27a74 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/exlpose.py @@ -0,0 +1,125 @@ +dataset_info = dict( + dataset_name='exlpose', + paper_info=dict( + author='Sohyun Lee, Jaesung Rim, Boseung Jeong, Geonu Kim,' + 'ByungJu Woo, Haechan Lee, Sunghyun Cho, Suha Kwak', + title='Human Pose Estimation in Extremely Low-Light Conditions', + container='arXiv', + year='2023', + homepage='https://arxiv.org/abs/2303.15410', + ), + keypoint_info={ + 0: + dict( + name='left_shoulder', + id=0, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 1: + dict( + name='right_shoulder', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 2: + dict( + name='left_elbow', + id=2, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='left_wrist', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 5: + dict( + name='right_wrist', + id=5, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_hip', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='left_knee', + id=8, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 9: + dict( + name='right_knee', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_ankle', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 11: + dict( + name='right_ankle', + id=11, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 12: + dict(name='head', id=12, color=[51, 153, 255], type='upper', swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], 
type='upper', swap='') + }, + skeleton_info={ + 0: dict(link=('head', 'neck'), id=0, color=[51, 153, 255]), + 1: dict(link=('neck', 'left_shoulder'), id=1, color=[51, 153, 255]), + 2: dict(link=('neck', 'right_shoulder'), id=2, color=[51, 153, 255]), + 3: dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]), + 4: dict(link=('left_elbow', 'left_wrist'), id=4, color=[0, 255, 0]), + 5: dict( + link=('right_shoulder', 'right_elbow'), id=5, color=[255, 128, 0]), + 6: + dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]), + 7: dict(link=('neck', 'right_hip'), id=7, color=[51, 153, 255]), + 8: dict(link=('neck', 'left_hip'), id=8, color=[51, 153, 255]), + 9: dict(link=('right_hip', 'right_knee'), id=9, color=[255, 128, 0]), + 10: + dict(link=('right_knee', 'right_ankle'), id=10, color=[255, 128, 0]), + 11: dict(link=('left_hip', 'left_knee'), id=11, color=[0, 255, 0]), + 12: dict(link=('left_knee', 'left_ankle'), id=12, color=[0, 255, 0]), + }, + joint_weights=[ + 0.2, 0.2, 0.2, 1.3, 1.5, 0.2, 1.3, 1.5, 0.2, 0.2, 0.5, 0.2, 0.2, 0.5 + ], + sigmas=[ + 0.079, 0.079, 0.072, 0.072, 0.062, 0.062, 0.107, 0.107, 0.087, 0.087, + 0.089, 0.089, 0.079, 0.079 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/fly.py b/modules/rtmpose/configs/_base_/datasets/fly.py new file mode 100644 index 0000000..46386b6 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/fly.py @@ -0,0 +1,237 @@ +dataset_info = dict( + dataset_name='fly', + paper_info=dict( + author='Pereira, Talmo D and Aldarondo, Diego E and ' + 'Willmore, Lindsay and Kislin, Mikhail and ' + 'Wang, Samuel S-H and Murthy, Mala and Shaevitz, Joshua W', + title='Fast animal pose estimation using deep neural networks', + container='Nature methods', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='head', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='eyeL', id=1, color=[255, 255, 255], type='', 
swap='eyeR'), + 2: + dict(name='eyeR', id=2, color=[255, 255, 255], type='', swap='eyeL'), + 3: + dict(name='neck', id=3, color=[255, 255, 255], type='', swap=''), + 4: + dict(name='thorax', id=4, color=[255, 255, 255], type='', swap=''), + 5: + dict(name='abdomen', id=5, color=[255, 255, 255], type='', swap=''), + 6: + dict( + name='forelegR1', + id=6, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 7: + dict( + name='forelegR2', + id=7, + color=[255, 255, 255], + type='', + swap='forelegL2'), + 8: + dict( + name='forelegR3', + id=8, + color=[255, 255, 255], + type='', + swap='forelegL3'), + 9: + dict( + name='forelegR4', + id=9, + color=[255, 255, 255], + type='', + swap='forelegL4'), + 10: + dict( + name='midlegR1', + id=10, + color=[255, 255, 255], + type='', + swap='midlegL1'), + 11: + dict( + name='midlegR2', + id=11, + color=[255, 255, 255], + type='', + swap='midlegL2'), + 12: + dict( + name='midlegR3', + id=12, + color=[255, 255, 255], + type='', + swap='midlegL3'), + 13: + dict( + name='midlegR4', + id=13, + color=[255, 255, 255], + type='', + swap='midlegL4'), + 14: + dict( + name='hindlegR1', + id=14, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 15: + dict( + name='hindlegR2', + id=15, + color=[255, 255, 255], + type='', + swap='hindlegL2'), + 16: + dict( + name='hindlegR3', + id=16, + color=[255, 255, 255], + type='', + swap='hindlegL3'), + 17: + dict( + name='hindlegR4', + id=17, + color=[255, 255, 255], + type='', + swap='hindlegL4'), + 18: + dict( + name='forelegL1', + id=18, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 19: + dict( + name='forelegL2', + id=19, + color=[255, 255, 255], + type='', + swap='forelegR2'), + 20: + dict( + name='forelegL3', + id=20, + color=[255, 255, 255], + type='', + swap='forelegR3'), + 21: + dict( + name='forelegL4', + id=21, + color=[255, 255, 255], + type='', + swap='forelegR4'), + 22: + dict( + name='midlegL1', + id=22, + color=[255, 255, 255], + type='', + swap='midlegR1'), + 
23: + dict( + name='midlegL2', + id=23, + color=[255, 255, 255], + type='', + swap='midlegR2'), + 24: + dict( + name='midlegL3', + id=24, + color=[255, 255, 255], + type='', + swap='midlegR3'), + 25: + dict( + name='midlegL4', + id=25, + color=[255, 255, 255], + type='', + swap='midlegR4'), + 26: + dict( + name='hindlegL1', + id=26, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 27: + dict( + name='hindlegL2', + id=27, + color=[255, 255, 255], + type='', + swap='hindlegR2'), + 28: + dict( + name='hindlegL3', + id=28, + color=[255, 255, 255], + type='', + swap='hindlegR3'), + 29: + dict( + name='hindlegL4', + id=29, + color=[255, 255, 255], + type='', + swap='hindlegR4'), + 30: + dict( + name='wingL', id=30, color=[255, 255, 255], type='', swap='wingR'), + 31: + dict( + name='wingR', id=31, color=[255, 255, 255], type='', swap='wingL'), + }, + skeleton_info={ + 0: dict(link=('eyeL', 'head'), id=0, color=[255, 255, 255]), + 1: dict(link=('eyeR', 'head'), id=1, color=[255, 255, 255]), + 2: dict(link=('neck', 'head'), id=2, color=[255, 255, 255]), + 3: dict(link=('thorax', 'neck'), id=3, color=[255, 255, 255]), + 4: dict(link=('abdomen', 'thorax'), id=4, color=[255, 255, 255]), + 5: dict(link=('forelegR2', 'forelegR1'), id=5, color=[255, 255, 255]), + 6: dict(link=('forelegR3', 'forelegR2'), id=6, color=[255, 255, 255]), + 7: dict(link=('forelegR4', 'forelegR3'), id=7, color=[255, 255, 255]), + 8: dict(link=('midlegR2', 'midlegR1'), id=8, color=[255, 255, 255]), + 9: dict(link=('midlegR3', 'midlegR2'), id=9, color=[255, 255, 255]), + 10: dict(link=('midlegR4', 'midlegR3'), id=10, color=[255, 255, 255]), + 11: + dict(link=('hindlegR2', 'hindlegR1'), id=11, color=[255, 255, 255]), + 12: + dict(link=('hindlegR3', 'hindlegR2'), id=12, color=[255, 255, 255]), + 13: + dict(link=('hindlegR4', 'hindlegR3'), id=13, color=[255, 255, 255]), + 14: + dict(link=('forelegL2', 'forelegL1'), id=14, color=[255, 255, 255]), + 15: + dict(link=('forelegL3', 'forelegL2'), id=15, 
color=[255, 255, 255]), + 16: + dict(link=('forelegL4', 'forelegL3'), id=16, color=[255, 255, 255]), + 17: dict(link=('midlegL2', 'midlegL1'), id=17, color=[255, 255, 255]), + 18: dict(link=('midlegL3', 'midlegL2'), id=18, color=[255, 255, 255]), + 19: dict(link=('midlegL4', 'midlegL3'), id=19, color=[255, 255, 255]), + 20: + dict(link=('hindlegL2', 'hindlegL1'), id=20, color=[255, 255, 255]), + 21: + dict(link=('hindlegL3', 'hindlegL2'), id=21, color=[255, 255, 255]), + 22: + dict(link=('hindlegL4', 'hindlegL3'), id=22, color=[255, 255, 255]), + 23: dict(link=('wingL', 'neck'), id=23, color=[255, 255, 255]), + 24: dict(link=('wingR', 'neck'), id=24, color=[255, 255, 255]) + }, + joint_weights=[1.] * 32, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/freihand2d.py b/modules/rtmpose/configs/_base_/datasets/freihand2d.py new file mode 100644 index 0000000..ae04742 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/freihand2d.py @@ -0,0 +1,144 @@ +dataset_info = dict( + dataset_name='freihand', + paper_info=dict( + author='Zimmermann, Christian and Ceylan, Duygu and ' + 'Yang, Jimei and Russell, Bryan and ' + 'Argus, Max and Brox, Thomas', + title='Freihand: A dataset for markerless capture of hand pose ' + 'and shape from single rgb images', + container='Proceedings of the IEEE International ' + 'Conference on Computer Vision', + year='2019', + homepage='https://lmb.informatik.uni-freiburg.de/projects/freihand/', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], 
type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + 
dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/h36m.py b/modules/rtmpose/configs/_base_/datasets/h36m.py new file mode 100644 index 0000000..f6be31f --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/h36m.py @@ -0,0 +1,152 @@ +dataset_info = dict( + dataset_name='h36m', + paper_info=dict( + author='Ionescu, Catalin and Papava, Dragos and ' + 'Olaru, Vlad and Sminchisescu, Cristian', + title='Human3.6M: Large Scale Datasets and Predictive ' + 'Methods for 3D Human Sensing in Natural Environments', + container='IEEE Transactions on Pattern Analysis and ' + 'Machine Intelligence', + year='2014', + homepage='http://vision.imar.ro/human3.6m/description.php', + ), + keypoint_info={ + 0: + dict(name='root', id=0, color=[51, 153, 255], type='lower', swap=''), + 1: + dict( + name='right_hip', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 2: + dict( + name='right_knee', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 3: + dict( + name='right_foot', + id=3, + color=[255, 128, 0], + type='lower', + swap='left_foot'), + 4: + dict( + name='left_hip', + id=4, + 
color=[0, 255, 0], + type='lower', + swap='right_hip'), + 5: + dict( + name='left_knee', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 6: + dict( + name='left_foot', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_foot'), + 7: + dict(name='spine', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict(name='thorax', id=8, color=[51, 153, 255], type='upper', swap=''), + 9: + dict( + name='neck_base', + id=9, + color=[51, 153, 255], + type='upper', + swap=''), + 10: + dict(name='head', id=10, color=[51, 153, 255], type='upper', swap=''), + 11: + dict( + name='left_shoulder', + id=11, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 12: + dict( + name='left_elbow', + id=12, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 13: + dict( + name='left_wrist', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 14: + dict( + name='right_shoulder', + id=14, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 15: + dict( + name='right_elbow', + id=15, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 16: + dict( + name='right_wrist', + id=16, + color=[255, 128, 0], + type='upper', + swap='left_wrist') + }, + skeleton_info={ + 0: + dict(link=('root', 'left_hip'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_hip', 'left_knee'), id=1, color=[0, 255, 0]), + 2: + dict(link=('left_knee', 'left_foot'), id=2, color=[0, 255, 0]), + 3: + dict(link=('root', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('right_hip', 'right_knee'), id=4, color=[255, 128, 0]), + 5: + dict(link=('right_knee', 'right_foot'), id=5, color=[255, 128, 0]), + 6: + dict(link=('root', 'spine'), id=6, color=[51, 153, 255]), + 7: + dict(link=('spine', 'thorax'), id=7, color=[51, 153, 255]), + 8: + dict(link=('thorax', 'neck_base'), id=8, color=[51, 153, 255]), + 9: + dict(link=('neck_base', 'head'), id=9, color=[51, 153, 255]), + 10: + dict(link=('thorax', 'left_shoulder'), id=10, 
color=[0, 255, 0]), + 11: + dict(link=('left_shoulder', 'left_elbow'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_elbow', 'left_wrist'), id=12, color=[0, 255, 0]), + 13: + dict(link=('thorax', 'right_shoulder'), id=13, color=[255, 128, 0]), + 14: + dict( + link=('right_shoulder', 'right_elbow'), id=14, color=[255, 128, + 0]), + 15: + dict(link=('right_elbow', 'right_wrist'), id=15, color=[255, 128, 0]) + }, + joint_weights=[1.] * 17, + sigmas=[], + stats_info=dict(bbox_center=(528., 427.), bbox_scale=400.)) diff --git a/modules/rtmpose/configs/_base_/datasets/h3wb.py b/modules/rtmpose/configs/_base_/datasets/h3wb.py new file mode 100644 index 0000000..24e3b4e --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/h3wb.py @@ -0,0 +1,1151 @@ +dataset_info = dict( + dataset_name='h3wb', + paper_info=dict( + author='Yue Zhu, Nermin Samet, David Picard', + title='H3WB: Human3.6M 3D WholeBody Dataset and Benchmark', + container='International Conf. on Computer Vision (ICCV)', + year='2023', + homepage='https://github.com/wholebody3d/wholebody3d', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), 
+ 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_big_toe', + id=17, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 18: + dict( + name='left_small_toe', + id=18, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 19: + dict( + name='left_heel', + id=19, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 20: + dict( + name='right_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 21: + dict( + name='right_small_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 22: + dict( + name='right_heel', + id=22, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 23: + dict( + name='face-0', + id=23, + color=[247, 34, 5], + type='upper', + swap='face-16'), + 24: + dict( + name='face-1', + id=24, + color=[247, 34, 5], + type='upper', + swap='face-15'), + 25: + dict( + name='face-2', + id=25, + color=[247, 34, 5], + type='upper', + swap='face-14'), + 26: + dict( + name='face-3', + id=26, + color=[247, 34, 5], + type='upper', + swap='face-13'), + 27: + dict( + name='face-4', + id=27, + color=[247, 34, 5], + type='upper', + swap='face-12'), + 28: + dict( + name='face-5', + 
id=28, + color=[247, 34, 5], + type='upper', + swap='face-11'), + 29: + dict( + name='face-6', + id=29, + color=[247, 34, 5], + type='upper', + swap='face-10'), + 30: + dict( + name='face-7', + id=30, + color=[247, 34, 5], + type='upper', + swap='face-9'), + 31: + dict(name='face-8', id=31, color=[247, 34, 5], type='upper', swap=''), + 32: + dict( + name='face-9', + id=32, + color=[247, 34, 5], + type='upper', + swap='face-7'), + 33: + dict( + name='face-10', + id=33, + color=[247, 34, 5], + type='upper', + swap='face-6'), + 34: + dict( + name='face-11', + id=34, + color=[247, 34, 5], + type='upper', + swap='face-5'), + 35: + dict( + name='face-12', + id=35, + color=[247, 34, 5], + type='upper', + swap='face-4'), + 36: + dict( + name='face-13', + id=36, + color=[247, 34, 5], + type='upper', + swap='face-3'), + 37: + dict( + name='face-14', + id=37, + color=[247, 34, 5], + type='upper', + swap='face-2'), + 38: + dict( + name='face-15', + id=38, + color=[247, 34, 5], + type='upper', + swap='face-1'), + 39: + dict( + name='face-16', + id=39, + color=[247, 34, 5], + type='upper', + swap='face-0'), + 40: + dict( + name='face-17', + id=40, + color=[247, 34, 5], + type='upper', + swap='face-26'), + 41: + dict( + name='face-18', + id=41, + color=[247, 34, 5], + type='upper', + swap='face-25'), + 42: + dict( + name='face-19', + id=42, + color=[247, 34, 5], + type='upper', + swap='face-24'), + 43: + dict( + name='face-20', + id=43, + color=[247, 34, 5], + type='upper', + swap='face-23'), + 44: + dict( + name='face-21', + id=44, + color=[247, 34, 5], + type='upper', + swap='face-22'), + 45: + dict( + name='face-22', + id=45, + color=[247, 34, 5], + type='upper', + swap='face-21'), + 46: + dict( + name='face-23', + id=46, + color=[247, 34, 5], + type='upper', + swap='face-20'), + 47: + dict( + name='face-24', + id=47, + color=[247, 34, 5], + type='upper', + swap='face-19'), + 48: + dict( + name='face-25', + id=48, + color=[247, 34, 5], + type='upper', + swap='face-18'), + 49: 
+ dict( + name='face-26', + id=49, + color=[247, 34, 5], + type='upper', + swap='face-17'), + 50: + dict(name='face-27', id=50, color=[247, 34, 5], type='upper', swap=''), + 51: + dict(name='face-28', id=51, color=[247, 34, 5], type='upper', swap=''), + 52: + dict(name='face-29', id=52, color=[247, 34, 5], type='upper', swap=''), + 53: + dict(name='face-30', id=53, color=[247, 34, 5], type='upper', swap=''), + 54: + dict( + name='face-31', + id=54, + color=[247, 34, 5], + type='upper', + swap='face-35'), + 55: + dict( + name='face-32', + id=55, + color=[247, 34, 5], + type='upper', + swap='face-34'), + 56: + dict(name='face-33', id=56, color=[247, 34, 5], type='upper', swap=''), + 57: + dict( + name='face-34', + id=57, + color=[247, 34, 5], + type='upper', + swap='face-32'), + 58: + dict( + name='face-35', + id=58, + color=[247, 34, 5], + type='upper', + swap='face-31'), + 59: + dict( + name='face-36', + id=59, + color=[247, 34, 5], + type='upper', + swap='face-45'), + 60: + dict( + name='face-37', + id=60, + color=[247, 34, 5], + type='upper', + swap='face-44'), + 61: + dict( + name='face-38', + id=61, + color=[247, 34, 5], + type='upper', + swap='face-43'), + 62: + dict( + name='face-39', + id=62, + color=[247, 34, 5], + type='upper', + swap='face-42'), + 63: + dict( + name='face-40', + id=63, + color=[247, 34, 5], + type='upper', + swap='face-47'), + 64: + dict( + name='face-41', + id=64, + color=[247, 34, 5], + type='upper', + swap='face-46'), + 65: + dict( + name='face-42', + id=65, + color=[247, 34, 5], + type='upper', + swap='face-39'), + 66: + dict( + name='face-43', + id=66, + color=[247, 34, 5], + type='upper', + swap='face-38'), + 67: + dict( + name='face-44', + id=67, + color=[247, 34, 5], + type='upper', + swap='face-37'), + 68: + dict( + name='face-45', + id=68, + color=[247, 34, 5], + type='upper', + swap='face-36'), + 69: + dict( + name='face-46', + id=69, + color=[247, 34, 5], + type='upper', + swap='face-41'), + 70: + dict( + name='face-47', + 
id=70, + color=[247, 34, 5], + type='upper', + swap='face-40'), + 71: + dict( + name='face-48', + id=71, + color=[247, 34, 5], + type='upper', + swap='face-54'), + 72: + dict( + name='face-49', + id=72, + color=[247, 34, 5], + type='upper', + swap='face-53'), + 73: + dict( + name='face-50', + id=73, + color=[247, 34, 5], + type='upper', + swap='face-52'), + 74: + dict(name='face-51', id=74, color=[247, 34, 5], type='upper', swap=''), + 75: + dict( + name='face-52', + id=75, + color=[247, 34, 5], + type='upper', + swap='face-50'), + 76: + dict( + name='face-53', + id=76, + color=[247, 34, 5], + type='upper', + swap='face-49'), + 77: + dict( + name='face-54', + id=77, + color=[247, 34, 5], + type='upper', + swap='face-48'), + 78: + dict( + name='face-55', + id=78, + color=[247, 34, 5], + type='upper', + swap='face-59'), + 79: + dict( + name='face-56', + id=79, + color=[247, 34, 5], + type='upper', + swap='face-58'), + 80: + dict(name='face-57', id=80, color=[247, 34, 5], type='upper', swap=''), + 81: + dict( + name='face-58', + id=81, + color=[247, 34, 5], + type='upper', + swap='face-56'), + 82: + dict( + name='face-59', + id=82, + color=[247, 34, 5], + type='upper', + swap='face-55'), + 83: + dict( + name='face-60', + id=83, + color=[247, 34, 5], + type='upper', + swap='face-64'), + 84: + dict( + name='face-61', + id=84, + color=[247, 34, 5], + type='upper', + swap='face-63'), + 85: + dict(name='face-62', id=85, color=[247, 34, 5], type='upper', swap=''), + 86: + dict( + name='face-63', + id=86, + color=[247, 34, 5], + type='upper', + swap='face-61'), + 87: + dict( + name='face-64', + id=87, + color=[247, 34, 5], + type='upper', + swap='face-60'), + 88: + dict( + name='face-65', + id=88, + color=[247, 34, 5], + type='upper', + swap='face-67'), + 89: + dict(name='face-66', id=89, color=[247, 34, 5], type='upper', swap=''), + 90: + dict( + name='face-67', + id=90, + color=[247, 34, 5], + type='upper', + swap='face-65'), + 91: + dict( + name='left_hand_root', + id=91, 
+ color=[247, 34, 5], + type='', + swap='right_hand_root'), + 92: + dict( + name='left_thumb1', + id=92, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 93: + dict( + name='left_thumb2', + id=93, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 94: + dict( + name='left_thumb3', + id=94, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 95: + dict( + name='left_thumb4', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 96: + dict( + name='left_forefinger1', + id=96, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 97: + dict( + name='left_forefinger2', + id=97, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 98: + dict( + name='left_forefinger3', + id=98, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 99: + dict( + name='left_forefinger4', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 100: + dict( + name='left_middle_finger1', + id=100, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 101: + dict( + name='left_middle_finger2', + id=101, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 102: + dict( + name='left_middle_finger3', + id=102, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 103: + dict( + name='left_middle_finger4', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 104: + dict( + name='left_ring_finger1', + id=104, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 105: + dict( + name='left_ring_finger2', + id=105, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 106: + dict( + name='left_ring_finger3', + id=106, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 107: + dict( + name='left_ring_finger4', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 108: + dict( + name='left_pinky_finger1', + id=108, + color=[0, 255, 0], + type='', + 
swap='right_pinky_finger1'), + 109: + dict( + name='left_pinky_finger2', + id=109, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 110: + dict( + name='left_pinky_finger3', + id=110, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 111: + dict( + name='left_pinky_finger4', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 112: + dict( + name='right_hand_root', + id=112, + color=[247, 34, 5], + type='', + swap='left_hand_root'), + 113: + dict( + name='right_thumb1', + id=113, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 114: + dict( + name='right_thumb2', + id=114, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 115: + dict( + name='right_thumb3', + id=115, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 116: + dict( + name='right_thumb4', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 117: + dict( + name='right_forefinger1', + id=117, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 118: + dict( + name='right_forefinger2', + id=118, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 119: + dict( + name='right_forefinger3', + id=119, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 120: + dict( + name='right_forefinger4', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 121: + dict( + name='right_middle_finger1', + id=121, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 122: + dict( + name='right_middle_finger2', + id=122, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 123: + dict( + name='right_middle_finger3', + id=123, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 124: + dict( + name='right_middle_finger4', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 125: + dict( + name='right_ring_finger1', + id=125, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 126: + 
dict( + name='right_ring_finger2', + id=126, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 127: + dict( + name='right_ring_finger3', + id=127, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 128: + dict( + name='right_ring_finger4', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 129: + dict( + name='right_pinky_finger1', + id=129, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 130: + dict( + name='right_pinky_finger2', + id=130, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 131: + dict( + name='right_pinky_finger3', + id=131, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 132: + dict( + name='right_pinky_finger4', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + 
dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]), + 22: + dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=37, + color=[255, 
51, 51]), + 38: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_hand_root', 'left_pinky_finger1'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_hand_root', 'right_thumb1'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_hand_root', 'right_forefinger1'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_hand_root', 'right_ring_finger1'), + id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger1', 
'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=61, + color=[0, 255, 0]), + 62: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + }, + joint_weights=[1.] * 133, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066, + 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, + 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, + 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, + 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, + 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, + 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, + 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, + 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, + 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, + 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/halpe.py b/modules/rtmpose/configs/_base_/datasets/halpe.py new file mode 100644 index 0000000..cccf9f4 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/halpe.py @@ -0,0 +1,1157 
@@ +dataset_info = dict( + dataset_name='halpe', + paper_info=dict( + author='Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie' + ' and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu' + ' and Ma, Ze and Chen, Mingyang and Lu, Cewu', + title='PaStaNet: Toward Human Activity Knowledge Engine', + container='CVPR', + year='2020', + homepage='https://github.com/Fang-Haoshu/Halpe-FullBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 
0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict(name='head', id=17, color=[255, 128, 0], type='upper', swap=''), + 18: + dict(name='neck', id=18, color=[255, 128, 0], type='upper', swap=''), + 19: + dict(name='hip', id=19, color=[255, 128, 0], type='lower', swap=''), + 20: + dict( + name='left_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 21: + dict( + name='right_big_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 22: + dict( + name='left_small_toe', + id=22, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 23: + dict( + name='right_small_toe', + id=23, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 24: + dict( + name='left_heel', + id=24, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 25: + dict( + name='right_heel', + id=25, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 26: + dict( + name='face-0', + id=26, + color=[255, 255, 255], + type='', + swap='face-16'), + 27: + dict( + name='face-1', + id=27, + color=[255, 255, 255], + type='', + swap='face-15'), + 28: + dict( + name='face-2', + id=28, + color=[255, 255, 255], + type='', + swap='face-14'), + 29: + dict( + name='face-3', + id=29, + color=[255, 255, 255], + type='', + swap='face-13'), + 30: + dict( + name='face-4', + id=30, + color=[255, 255, 255], + type='', + swap='face-12'), + 31: + dict( + name='face-5', + id=31, + color=[255, 255, 255], + type='', + swap='face-11'), + 32: + dict( + name='face-6', + id=32, + color=[255, 255, 255], + type='', + swap='face-10'), + 33: + dict( + name='face-7', + id=33, + color=[255, 255, 255], + type='', + swap='face-9'), + 34: + dict(name='face-8', id=34, color=[255, 255, 255], type='', swap=''), + 35: + dict( + name='face-9', + id=35, + color=[255, 255, 255], + type='', + swap='face-7'), + 36: + dict( + name='face-10', 
+ id=36, + color=[255, 255, 255], + type='', + swap='face-6'), + 37: + dict( + name='face-11', + id=37, + color=[255, 255, 255], + type='', + swap='face-5'), + 38: + dict( + name='face-12', + id=38, + color=[255, 255, 255], + type='', + swap='face-4'), + 39: + dict( + name='face-13', + id=39, + color=[255, 255, 255], + type='', + swap='face-3'), + 40: + dict( + name='face-14', + id=40, + color=[255, 255, 255], + type='', + swap='face-2'), + 41: + dict( + name='face-15', + id=41, + color=[255, 255, 255], + type='', + swap='face-1'), + 42: + dict( + name='face-16', + id=42, + color=[255, 255, 255], + type='', + swap='face-0'), + 43: + dict( + name='face-17', + id=43, + color=[255, 255, 255], + type='', + swap='face-26'), + 44: + dict( + name='face-18', + id=44, + color=[255, 255, 255], + type='', + swap='face-25'), + 45: + dict( + name='face-19', + id=45, + color=[255, 255, 255], + type='', + swap='face-24'), + 46: + dict( + name='face-20', + id=46, + color=[255, 255, 255], + type='', + swap='face-23'), + 47: + dict( + name='face-21', + id=47, + color=[255, 255, 255], + type='', + swap='face-22'), + 48: + dict( + name='face-22', + id=48, + color=[255, 255, 255], + type='', + swap='face-21'), + 49: + dict( + name='face-23', + id=49, + color=[255, 255, 255], + type='', + swap='face-20'), + 50: + dict( + name='face-24', + id=50, + color=[255, 255, 255], + type='', + swap='face-19'), + 51: + dict( + name='face-25', + id=51, + color=[255, 255, 255], + type='', + swap='face-18'), + 52: + dict( + name='face-26', + id=52, + color=[255, 255, 255], + type='', + swap='face-17'), + 53: + dict(name='face-27', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict(name='face-28', id=54, color=[255, 255, 255], type='', swap=''), + 55: + dict(name='face-29', id=55, color=[255, 255, 255], type='', swap=''), + 56: + dict(name='face-30', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-31', + id=57, + color=[255, 255, 255], + type='', + 
swap='face-35'), + 58: + dict( + name='face-32', + id=58, + color=[255, 255, 255], + type='', + swap='face-34'), + 59: + dict(name='face-33', id=59, color=[255, 255, 255], type='', swap=''), + 60: + dict( + name='face-34', + id=60, + color=[255, 255, 255], + type='', + swap='face-32'), + 61: + dict( + name='face-35', + id=61, + color=[255, 255, 255], + type='', + swap='face-31'), + 62: + dict( + name='face-36', + id=62, + color=[255, 255, 255], + type='', + swap='face-45'), + 63: + dict( + name='face-37', + id=63, + color=[255, 255, 255], + type='', + swap='face-44'), + 64: + dict( + name='face-38', + id=64, + color=[255, 255, 255], + type='', + swap='face-43'), + 65: + dict( + name='face-39', + id=65, + color=[255, 255, 255], + type='', + swap='face-42'), + 66: + dict( + name='face-40', + id=66, + color=[255, 255, 255], + type='', + swap='face-47'), + 67: + dict( + name='face-41', + id=67, + color=[255, 255, 255], + type='', + swap='face-46'), + 68: + dict( + name='face-42', + id=68, + color=[255, 255, 255], + type='', + swap='face-39'), + 69: + dict( + name='face-43', + id=69, + color=[255, 255, 255], + type='', + swap='face-38'), + 70: + dict( + name='face-44', + id=70, + color=[255, 255, 255], + type='', + swap='face-37'), + 71: + dict( + name='face-45', + id=71, + color=[255, 255, 255], + type='', + swap='face-36'), + 72: + dict( + name='face-46', + id=72, + color=[255, 255, 255], + type='', + swap='face-41'), + 73: + dict( + name='face-47', + id=73, + color=[255, 255, 255], + type='', + swap='face-40'), + 74: + dict( + name='face-48', + id=74, + color=[255, 255, 255], + type='', + swap='face-54'), + 75: + dict( + name='face-49', + id=75, + color=[255, 255, 255], + type='', + swap='face-53'), + 76: + dict( + name='face-50', + id=76, + color=[255, 255, 255], + type='', + swap='face-52'), + 77: + dict(name='face-51', id=77, color=[255, 255, 255], type='', swap=''), + 78: + dict( + name='face-52', + id=78, + color=[255, 255, 255], + type='', + swap='face-50'), + 
79: + dict( + name='face-53', + id=79, + color=[255, 255, 255], + type='', + swap='face-49'), + 80: + dict( + name='face-54', + id=80, + color=[255, 255, 255], + type='', + swap='face-48'), + 81: + dict( + name='face-55', + id=81, + color=[255, 255, 255], + type='', + swap='face-59'), + 82: + dict( + name='face-56', + id=82, + color=[255, 255, 255], + type='', + swap='face-58'), + 83: + dict(name='face-57', id=83, color=[255, 255, 255], type='', swap=''), + 84: + dict( + name='face-58', + id=84, + color=[255, 255, 255], + type='', + swap='face-56'), + 85: + dict( + name='face-59', + id=85, + color=[255, 255, 255], + type='', + swap='face-55'), + 86: + dict( + name='face-60', + id=86, + color=[255, 255, 255], + type='', + swap='face-64'), + 87: + dict( + name='face-61', + id=87, + color=[255, 255, 255], + type='', + swap='face-63'), + 88: + dict(name='face-62', id=88, color=[255, 255, 255], type='', swap=''), + 89: + dict( + name='face-63', + id=89, + color=[255, 255, 255], + type='', + swap='face-61'), + 90: + dict( + name='face-64', + id=90, + color=[255, 255, 255], + type='', + swap='face-60'), + 91: + dict( + name='face-65', + id=91, + color=[255, 255, 255], + type='', + swap='face-67'), + 92: + dict(name='face-66', id=92, color=[255, 255, 255], type='', swap=''), + 93: + dict( + name='face-67', + id=93, + color=[255, 255, 255], + type='', + swap='face-65'), + 94: + dict( + name='left_hand_root', + id=94, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 95: + dict( + name='left_thumb1', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 96: + dict( + name='left_thumb2', + id=96, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 97: + dict( + name='left_thumb3', + id=97, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 98: + dict( + name='left_thumb4', + id=98, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 99: + dict( + name='left_forefinger1', + id=99, + color=[255, 153, 255], + type='', + 
swap='right_forefinger1'), + 100: + dict( + name='left_forefinger2', + id=100, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 101: + dict( + name='left_forefinger3', + id=101, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 102: + dict( + name='left_forefinger4', + id=102, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 103: + dict( + name='left_middle_finger1', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 104: + dict( + name='left_middle_finger2', + id=104, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 105: + dict( + name='left_middle_finger3', + id=105, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 106: + dict( + name='left_middle_finger4', + id=106, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 107: + dict( + name='left_ring_finger1', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 108: + dict( + name='left_ring_finger2', + id=108, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 109: + dict( + name='left_ring_finger3', + id=109, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 110: + dict( + name='left_ring_finger4', + id=110, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 111: + dict( + name='left_pinky_finger1', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 112: + dict( + name='left_pinky_finger2', + id=112, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 113: + dict( + name='left_pinky_finger3', + id=113, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 114: + dict( + name='left_pinky_finger4', + id=114, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 115: + dict( + name='right_hand_root', + id=115, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 116: + dict( + name='right_thumb1', + id=116, + color=[255, 128, 0], + 
type='', + swap='left_thumb1'), + 117: + dict( + name='right_thumb2', + id=117, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 118: + dict( + name='right_thumb3', + id=118, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 119: + dict( + name='right_thumb4', + id=119, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 120: + dict( + name='right_forefinger1', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 121: + dict( + name='right_forefinger2', + id=121, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 122: + dict( + name='right_forefinger3', + id=122, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 123: + dict( + name='right_forefinger4', + id=123, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 124: + dict( + name='right_middle_finger1', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 125: + dict( + name='right_middle_finger2', + id=125, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 126: + dict( + name='right_middle_finger3', + id=126, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 127: + dict( + name='right_middle_finger4', + id=127, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 128: + dict( + name='right_ring_finger1', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 129: + dict( + name='right_ring_finger2', + id=129, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 130: + dict( + name='right_ring_finger3', + id=130, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 131: + dict( + name='right_ring_finger4', + id=131, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 132: + dict( + name='right_pinky_finger1', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 133: + dict( + name='right_pinky_finger2', + id=133, + color=[0, 255, 0], + type='', + 
swap='left_pinky_finger2'), + 134: + dict( + name='right_pinky_finger3', + id=134, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 135: + dict( + name='right_pinky_finger4', + id=135, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('left_hip', 'hip'), id=2, color=[0, 255, 0]), + 3: + dict(link=('right_ankle', 'right_knee'), id=3, color=[255, 128, 0]), + 4: + dict(link=('right_knee', 'right_hip'), id=4, color=[255, 128, 0]), + 5: + dict(link=('right_hip', 'hip'), id=5, color=[255, 128, 0]), + 6: + dict(link=('head', 'neck'), id=6, color=[51, 153, 255]), + 7: + dict(link=('neck', 'hip'), id=7, color=[51, 153, 255]), + 8: + dict(link=('neck', 'left_shoulder'), id=8, color=[0, 255, 0]), + 9: + dict(link=('left_shoulder', 'left_elbow'), id=9, color=[0, 255, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('neck', 'right_shoulder'), id=11, color=[255, 128, 0]), + 12: + dict( + link=('right_shoulder', 'right_elbow'), id=12, color=[255, 128, + 0]), + 13: + dict(link=('right_elbow', 'right_wrist'), id=13, color=[255, 128, 0]), + 14: + dict(link=('left_eye', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('nose', 'left_eye'), id=15, color=[51, 153, 255]), + 16: + dict(link=('nose', 'right_eye'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_eye', 'left_ear'), id=17, color=[51, 153, 255]), + 18: + dict(link=('right_eye', 'right_ear'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ear', 'left_shoulder'), id=19, color=[51, 153, 255]), + 20: + dict( + link=('right_ear', 'right_shoulder'), id=20, color=[51, 153, 255]), + 21: + dict(link=('left_ankle', 'left_big_toe'), id=21, color=[0, 255, 0]), + 22: + dict(link=('left_ankle', 'left_small_toe'), id=22, color=[0, 255, 0]), + 23: + 
dict(link=('left_ankle', 'left_heel'), id=23, color=[0, 255, 0]), + 24: + dict( + link=('right_ankle', 'right_big_toe'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('right_ankle', 'right_small_toe'), + id=25, + color=[255, 128, 0]), + 26: + dict(link=('right_ankle', 'right_heel'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_wrist', 'left_thumb1'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb1', 'left_thumb2'), id=28, color=[255, 128, 0]), + 29: + dict(link=('left_thumb2', 'left_thumb3'), id=29, color=[255, 128, 0]), + 30: + dict(link=('left_thumb3', 'left_thumb4'), id=30, color=[255, 128, 0]), + 31: + dict( + link=('left_wrist', 'left_forefinger1'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=33, + color=[255, 153, 255]), + 34: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=34, + color=[255, 153, 255]), + 35: + dict( + link=('left_wrist', 'left_middle_finger1'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=37, + color=[102, 178, 255]), + 38: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=38, + color=[102, 178, 255]), + 39: + dict( + link=('left_wrist', 'left_ring_finger1'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=41, + color=[255, 51, 51]), + 42: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=42, + color=[255, 51, 51]), + 43: + dict( + link=('left_wrist', 'left_pinky_finger1'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=44, + color=[0, 255, 0]), 
+ 45: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=45, + color=[0, 255, 0]), + 46: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=46, + color=[0, 255, 0]), + 47: + dict(link=('right_wrist', 'right_thumb1'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb1', 'right_thumb2'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_thumb2', 'right_thumb3'), id=49, color=[255, 128, 0]), + 50: + dict( + link=('right_thumb3', 'right_thumb4'), id=50, color=[255, 128, 0]), + 51: + dict( + link=('right_wrist', 'right_forefinger1'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=53, + color=[255, 153, 255]), + 54: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=54, + color=[255, 153, 255]), + 55: + dict( + link=('right_wrist', 'right_middle_finger1'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=57, + color=[102, 178, 255]), + 58: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=58, + color=[102, 178, 255]), + 59: + dict( + link=('right_wrist', 'right_ring_finger1'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=61, + color=[255, 51, 51]), + 62: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=62, + color=[255, 51, 51]), + 63: + dict( + link=('right_wrist', 'right_pinky_finger1'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=64, + color=[0, 255, 0]), + 65: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=65, 
+ color=[0, 255, 0]), + 66: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=66, + color=[0, 255, 0]) + }, + joint_weights=[1.] * 136, + + # 'https://github.com/Fang-Haoshu/Halpe-FullBody/blob/master/' + # 'HalpeCOCOAPI/PythonAPI/halpecocotools/cocoeval.py#L245' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.08, 0.08, 0.08, + 0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/halpe26.py b/modules/rtmpose/configs/_base_/datasets/halpe26.py new file mode 100644 index 0000000..7f4d549 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/halpe26.py @@ -0,0 +1,274 @@ +dataset_info = dict( + dataset_name='halpe26', + paper_info=dict( + author='Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie' + ' and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu' + ' and Ma, Ze and Chen, Mingyang and Lu, Cewu', + title='PaStaNet: Toward Human Activity Knowledge Engine', + container='CVPR', + year='2020', + homepage='https://github.com/Fang-Haoshu/Halpe-FullBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], 
type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict(name='head', id=17, color=[255, 128, 0], type='upper', swap=''), + 18: + dict(name='neck', id=18, color=[255, 128, 0], type='upper', swap=''), + 19: + dict(name='hip', id=19, color=[255, 128, 0], type='lower', swap=''), + 20: + dict( + name='left_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + 
swap='right_big_toe'), + 21: + dict( + name='right_big_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 22: + dict( + name='left_small_toe', + id=22, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 23: + dict( + name='right_small_toe', + id=23, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 24: + dict( + name='left_heel', + id=24, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 25: + dict( + name='right_heel', + id=25, + color=[255, 128, 0], + type='lower', + swap='left_heel') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('left_hip', 'hip'), id=2, color=[0, 255, 0]), + 3: + dict(link=('right_ankle', 'right_knee'), id=3, color=[255, 128, 0]), + 4: + dict(link=('right_knee', 'right_hip'), id=4, color=[255, 128, 0]), + 5: + dict(link=('right_hip', 'hip'), id=5, color=[255, 128, 0]), + 6: + dict(link=('head', 'neck'), id=6, color=[51, 153, 255]), + 7: + dict(link=('neck', 'hip'), id=7, color=[51, 153, 255]), + 8: + dict(link=('neck', 'left_shoulder'), id=8, color=[0, 255, 0]), + 9: + dict(link=('left_shoulder', 'left_elbow'), id=9, color=[0, 255, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('neck', 'right_shoulder'), id=11, color=[255, 128, 0]), + 12: + dict( + link=('right_shoulder', 'right_elbow'), id=12, color=[255, 128, + 0]), + 13: + dict(link=('right_elbow', 'right_wrist'), id=13, color=[255, 128, 0]), + 14: + dict(link=('left_eye', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('nose', 'left_eye'), id=15, color=[51, 153, 255]), + 16: + dict(link=('nose', 'right_eye'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_eye', 'left_ear'), id=17, color=[51, 153, 255]), + 18: + dict(link=('right_eye', 'right_ear'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ear', 
'left_shoulder'), id=19, color=[51, 153, 255]), + 20: + dict( + link=('right_ear', 'right_shoulder'), id=20, color=[51, 153, 255]), + 21: + dict(link=('left_ankle', 'left_big_toe'), id=21, color=[0, 255, 0]), + 22: + dict(link=('left_ankle', 'left_small_toe'), id=22, color=[0, 255, 0]), + 23: + dict(link=('left_ankle', 'left_heel'), id=23, color=[0, 255, 0]), + 24: + dict( + link=('right_ankle', 'right_big_toe'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('right_ankle', 'right_small_toe'), + id=25, + color=[255, 128, 0]), + 26: + dict(link=('right_ankle', 'right_heel'), id=26, color=[255, 128, 0]), + }, + # the joint_weights is modified by MMPose Team + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ] + [1., 1., 1.2] + [1.5] * 6, + + # 'https://github.com/Fang-Haoshu/Halpe-FullBody/blob/master/' + # 'HalpeCOCOAPI/PythonAPI/halpecocotools/cocoeval.py#L245' + sigmas=[ + 0.026, + 0.025, + 0.025, + 0.035, + 0.035, + 0.079, + 0.079, + 0.072, + 0.072, + 0.062, + 0.062, + 0.107, + 0.107, + 0.087, + 0.087, + 0.089, + 0.089, + 0.026, + 0.026, + 0.066, + 0.079, + 0.079, + 0.079, + 0.079, + 0.079, + 0.079, + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/horse10.py b/modules/rtmpose/configs/_base_/datasets/horse10.py new file mode 100644 index 0000000..60cec1f --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/horse10.py @@ -0,0 +1,201 @@ +dataset_info = dict( + dataset_name='horse10', + paper_info=dict( + author='Mathis, Alexander and Biasi, Thomas and ' + 'Schneider, Steffen and ' + 'Yuksekgonul, Mert and Rogers, Byron and ' + 'Bethge, Matthias and ' + 'Mathis, Mackenzie W', + title='Pretraining boosts out-of-domain robustness ' + 'for pose estimation', + container='Proceedings of the IEEE/CVF Winter Conference on ' + 'Applications of Computer Vision', + year='2021', + homepage='http://www.mackenziemathislab.org/horse10', + ), + keypoint_info={ + 0: + dict(name='Nose', id=0, color=[255, 153, 255], 
type='upper', swap=''), + 1: + dict(name='Eye', id=1, color=[255, 153, 255], type='upper', swap=''), + 2: + dict( + name='Nearknee', + id=2, + color=[255, 102, 255], + type='upper', + swap=''), + 3: + dict( + name='Nearfrontfetlock', + id=3, + color=[255, 102, 255], + type='upper', + swap=''), + 4: + dict( + name='Nearfrontfoot', + id=4, + color=[255, 102, 255], + type='upper', + swap=''), + 5: + dict( + name='Offknee', id=5, color=[255, 102, 255], type='upper', + swap=''), + 6: + dict( + name='Offfrontfetlock', + id=6, + color=[255, 102, 255], + type='upper', + swap=''), + 7: + dict( + name='Offfrontfoot', + id=7, + color=[255, 102, 255], + type='upper', + swap=''), + 8: + dict( + name='Shoulder', + id=8, + color=[255, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='Midshoulder', + id=9, + color=[255, 153, 255], + type='upper', + swap=''), + 10: + dict( + name='Elbow', id=10, color=[255, 153, 255], type='upper', swap=''), + 11: + dict( + name='Girth', id=11, color=[255, 153, 255], type='upper', swap=''), + 12: + dict( + name='Wither', id=12, color=[255, 153, 255], type='upper', + swap=''), + 13: + dict( + name='Nearhindhock', + id=13, + color=[255, 51, 255], + type='lower', + swap=''), + 14: + dict( + name='Nearhindfetlock', + id=14, + color=[255, 51, 255], + type='lower', + swap=''), + 15: + dict( + name='Nearhindfoot', + id=15, + color=[255, 51, 255], + type='lower', + swap=''), + 16: + dict(name='Hip', id=16, color=[255, 153, 255], type='lower', swap=''), + 17: + dict( + name='Stifle', id=17, color=[255, 153, 255], type='lower', + swap=''), + 18: + dict( + name='Offhindhock', + id=18, + color=[255, 51, 255], + type='lower', + swap=''), + 19: + dict( + name='Offhindfetlock', + id=19, + color=[255, 51, 255], + type='lower', + swap=''), + 20: + dict( + name='Offhindfoot', + id=20, + color=[255, 51, 255], + type='lower', + swap=''), + 21: + dict( + name='Ischium', + id=21, + color=[255, 153, 255], + type='lower', + swap='') + }, + skeleton_info={ + 0: + 
dict(link=('Nose', 'Eye'), id=0, color=[255, 153, 255]), + 1: + dict(link=('Eye', 'Wither'), id=1, color=[255, 153, 255]), + 2: + dict(link=('Wither', 'Hip'), id=2, color=[255, 153, 255]), + 3: + dict(link=('Hip', 'Ischium'), id=3, color=[255, 153, 255]), + 4: + dict(link=('Ischium', 'Stifle'), id=4, color=[255, 153, 255]), + 5: + dict(link=('Stifle', 'Girth'), id=5, color=[255, 153, 255]), + 6: + dict(link=('Girth', 'Elbow'), id=6, color=[255, 153, 255]), + 7: + dict(link=('Elbow', 'Shoulder'), id=7, color=[255, 153, 255]), + 8: + dict(link=('Shoulder', 'Midshoulder'), id=8, color=[255, 153, 255]), + 9: + dict(link=('Midshoulder', 'Wither'), id=9, color=[255, 153, 255]), + 10: + dict( + link=('Nearknee', 'Nearfrontfetlock'), + id=10, + color=[255, 102, 255]), + 11: + dict( + link=('Nearfrontfetlock', 'Nearfrontfoot'), + id=11, + color=[255, 102, 255]), + 12: + dict( + link=('Offknee', 'Offfrontfetlock'), id=12, color=[255, 102, 255]), + 13: + dict( + link=('Offfrontfetlock', 'Offfrontfoot'), + id=13, + color=[255, 102, 255]), + 14: + dict( + link=('Nearhindhock', 'Nearhindfetlock'), + id=14, + color=[255, 51, 255]), + 15: + dict( + link=('Nearhindfetlock', 'Nearhindfoot'), + id=15, + color=[255, 51, 255]), + 16: + dict( + link=('Offhindhock', 'Offhindfetlock'), + id=16, + color=[255, 51, 255]), + 17: + dict( + link=('Offhindfetlock', 'Offhindfoot'), + id=17, + color=[255, 51, 255]) + }, + joint_weights=[1.] 
* 22, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/humanart.py b/modules/rtmpose/configs/_base_/datasets/humanart.py new file mode 100644 index 0000000..8054926 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/humanart.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='Human-Art', + paper_info=dict( + author='Ju, Xuan and Zeng, Ailing and ' + 'Wang, Jianan and Xu, Qiang and Zhang, Lei', + title='Human-Art: A Versatile Human-Centric Dataset ' + 'Bridging Natural and Artificial Scenes', + container='Proceedings of the IEEE/CVF Conference on ' + 'Computer Vision and Pattern Recognition', + year='2023', + homepage='https://idea-research.github.io/HumanArt/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 
128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 
1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/humanart21.py b/modules/rtmpose/configs/_base_/datasets/humanart21.py new file mode 100644 index 0000000..b8cb226 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/humanart21.py @@ -0,0 +1,218 @@ +dataset_info = dict( + dataset_name='Human-Art', + paper_info=dict( + author='Ju, Xuan and Zeng, Ailing and ' + 'Wang, Jianan and Xu, Qiang and Zhang, Lei', + title='Human-Art: A Versatile Human-Centric Dataset ' + 'Bridging Natural and Artificial Scenes', + container='Proceedings of the IEEE/CVF Conference on ' + 'Computer Vision and Pattern Recognition', + year='2023', + homepage='https://idea-research.github.io/HumanArt/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + 
swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_finger', + id=17, + color=[0, 255, 0], + type='lower', + swap='right_finger'), + 18: + dict( + name='right_finger', + id=18, + color=[255, 128, 0], + type='lower', + swap='left_finger'), + 19: + dict( + name='left_toe', + id=19, + color=[0, 255, 0], + type='lower', + swap='right_toe'), + 20: + dict( + name='right_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_toe'), + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 
0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_toe'), id=19, color=[0, 255, 0]), + 20: + dict(link=('right_ankle', 'right_toe'), id=20, color=[255, 128, 0]), + 21: + dict(link=('left_wrist', 'left_finger'), id=21, color=[0, 255, 0]), + 22: + dict(link=('right_wrist', 'right_finger'), id=22, color=[255, 128, 0]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5, 1., 1., 1., 1. + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.089, 0.089, 0.089, + 0.089 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/humanart_aic.py b/modules/rtmpose/configs/_base_/datasets/humanart_aic.py new file mode 100644 index 0000000..573e9a0 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/humanart_aic.py @@ -0,0 +1,205 @@ +dataset_info = dict( + dataset_name='humanart', + paper_info=[ + dict( + author='Ju, Xuan and Zeng, Ailing and ' + 'Wang, Jianan and Xu, Qiang and Zhang, ' + 'Lei', + title='Human-Art: A Versatile Human-Centric Dataset ' + 'Bridging Natural and Artificial Scenes', + container='CVPR', + year='2023', + homepage='https://idea-research.github.io/HumanArt/', + ), + dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset 
for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + ], + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='head_top', + id=17, + color=[51, 153, 255], + type='upper', + 
swap=''), + 18: + dict(name='neck', id=18, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('head_top', 'neck'), id=19, color=[51, 153, 255]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5, 1.5, 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.026, 0.026 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/interhand2d.py
b/modules/rtmpose/configs/_base_/datasets/interhand2d.py new file mode 100644 index 0000000..e60dfc7 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/interhand2d.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='interhand2d', + paper_info=dict( + author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and ' + 'Shiratori, Takaaki and Lee, Kyoung Mu', + title='InterHand2.6M: A dataset and baseline for 3D ' + 'interacting hand pose estimation from a single RGB image', + container='arXiv', + year='2020', + homepage='https://mks0601.github.io/InterHand2.6M/', + ), + keypoint_info={ + 0: + dict(name='thumb4', id=0, color=[255, 128, 0], type='', swap=''), + 1: + dict(name='thumb3', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb1', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict( + name='forefinger4', id=4, color=[255, 153, 255], type='', swap=''), + 5: + dict( + name='forefinger3', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger1', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='middle_finger4', + id=8, + color=[102, 178, 255], + type='', + swap=''), + 9: + dict( + name='middle_finger3', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger1', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='ring_finger4', id=12, color=[255, 51, 51], type='', swap=''), + 13: + dict( + name='ring_finger3', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger1', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict(name='pinky_finger4', id=16, color=[0, 255, 
0], type='', swap=''), + 17: + dict(name='pinky_finger3', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger1', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='wrist', id=20, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/interhand3d.py b/modules/rtmpose/configs/_base_/datasets/interhand3d.py new file mode 100644 index 0000000..26b7ccf --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/interhand3d.py @@ -0,0 +1,487 @@ +dataset_info = dict( + dataset_name='interhand3d', + paper_info=dict( + author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and ' + 'Shiratori, Takaaki and Lee, Kyoung Mu', + title='InterHand2.6M: A dataset and baseline for 3D ' + 'interacting hand pose estimation from a single RGB image', + container='arXiv', + year='2020', + homepage='https://mks0601.github.io/InterHand2.6M/', + ), + keypoint_info={ + 0: + dict( + name='right_thumb4', + id=0, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 1: + dict( + name='right_thumb3', + id=1, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 2: + dict( + name='right_thumb2', + id=2, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 3: + dict( + name='right_thumb1', + id=3, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 4: + dict( + name='right_forefinger4', + id=4, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 5: + dict( + name='right_forefinger3', + id=5, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 6: + dict( + name='right_forefinger2', + id=6, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 7: + dict( + name='right_forefinger1', + id=7, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 8: + dict( + name='right_middle_finger4', + id=8, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 9: + dict( + name='right_middle_finger3', + id=9, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 10: + dict( + name='right_middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 11: + dict( + name='right_middle_finger1', + id=11, + color=[102, 178, 255], + type='', + 
swap='left_middle_finger1'), + 12: + dict( + name='right_ring_finger4', + id=12, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 13: + dict( + name='right_ring_finger3', + id=13, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 14: + dict( + name='right_ring_finger2', + id=14, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 15: + dict( + name='right_ring_finger1', + id=15, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 16: + dict( + name='right_pinky_finger4', + id=16, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4'), + 17: + dict( + name='right_pinky_finger3', + id=17, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 18: + dict( + name='right_pinky_finger2', + id=18, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 19: + dict( + name='right_pinky_finger1', + id=19, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 20: + dict( + name='right_wrist', + id=20, + color=[255, 255, 255], + type='', + swap='left_wrist'), + 21: + dict( + name='left_thumb4', + id=21, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 22: + dict( + name='left_thumb3', + id=22, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 23: + dict( + name='left_thumb2', + id=23, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 24: + dict( + name='left_thumb1', + id=24, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 25: + dict( + name='left_forefinger4', + id=25, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 26: + dict( + name='left_forefinger3', + id=26, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 27: + dict( + name='left_forefinger2', + id=27, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 28: + dict( + name='left_forefinger1', + id=28, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 29: + dict( + name='left_middle_finger4', + id=29, + color=[102, 178, 
255], + type='', + swap='right_middle_finger4'), + 30: + dict( + name='left_middle_finger3', + id=30, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 31: + dict( + name='left_middle_finger2', + id=31, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 32: + dict( + name='left_middle_finger1', + id=32, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 33: + dict( + name='left_ring_finger4', + id=33, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 34: + dict( + name='left_ring_finger3', + id=34, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 35: + dict( + name='left_ring_finger2', + id=35, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 36: + dict( + name='left_ring_finger1', + id=36, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 37: + dict( + name='left_pinky_finger4', + id=37, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 38: + dict( + name='left_pinky_finger3', + id=38, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 39: + dict( + name='left_pinky_finger2', + id=39, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 40: + dict( + name='left_pinky_finger1', + id=40, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 41: + dict( + name='left_wrist', + id=41, + color=[255, 255, 255], + type='', + swap='right_wrist'), + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_thumb1', 'right_thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_thumb2', 'right_thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_thumb3', 'right_thumb4'), id=3, color=[255, 128, 0]), + 4: + dict( + link=('right_wrist', 'right_forefinger1'), + id=4, + color=[255, 153, 255]), + 5: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=5, + color=[255, 153, 255]), + 6: + dict( + 
link=('right_forefinger2', 'right_forefinger3'), + id=6, + color=[255, 153, 255]), + 7: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=7, + color=[255, 153, 255]), + 8: + dict( + link=('right_wrist', 'right_middle_finger1'), + id=8, + color=[102, 178, 255]), + 9: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict( + link=('right_wrist', 'right_ring_finger1'), + id=12, + color=[255, 51, 51]), + 13: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=13, + color=[255, 51, 51]), + 14: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=14, + color=[255, 51, 51]), + 15: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=15, + color=[255, 51, 51]), + 16: + dict( + link=('right_wrist', 'right_pinky_finger1'), + id=16, + color=[0, 255, 0]), + 17: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=17, + color=[0, 255, 0]), + 18: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=18, + color=[0, 255, 0]), + 19: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=19, + color=[0, 255, 0]), + 20: + dict(link=('left_wrist', 'left_thumb1'), id=20, color=[255, 128, 0]), + 21: + dict(link=('left_thumb1', 'left_thumb2'), id=21, color=[255, 128, 0]), + 22: + dict(link=('left_thumb2', 'left_thumb3'), id=22, color=[255, 128, 0]), + 23: + dict(link=('left_thumb3', 'left_thumb4'), id=23, color=[255, 128, 0]), + 24: + dict( + link=('left_wrist', 'left_forefinger1'), + id=24, + color=[255, 153, 255]), + 25: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=25, + color=[255, 153, 255]), + 26: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=26, + color=[255, 153, 255]), + 27: + dict( + 
link=('left_forefinger3', 'left_forefinger4'), + id=27, + color=[255, 153, 255]), + 28: + dict( + link=('left_wrist', 'left_middle_finger1'), + id=28, + color=[102, 178, 255]), + 29: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=29, + color=[102, 178, 255]), + 30: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=30, + color=[102, 178, 255]), + 31: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=31, + color=[102, 178, 255]), + 32: + dict( + link=('left_wrist', 'left_ring_finger1'), + id=32, + color=[255, 51, 51]), + 33: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=33, + color=[255, 51, 51]), + 34: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=34, + color=[255, 51, 51]), + 35: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=35, + color=[255, 51, 51]), + 36: + dict( + link=('left_wrist', 'left_pinky_finger1'), + id=36, + color=[0, 255, 0]), + 37: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=37, + color=[0, 255, 0]), + 38: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=38, + color=[0, 255, 0]), + 39: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=39, + color=[0, 255, 0]), + }, + joint_weights=[1.] * 42, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/jhmdb.py b/modules/rtmpose/configs/_base_/datasets/jhmdb.py new file mode 100644 index 0000000..1f931fc --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/jhmdb.py @@ -0,0 +1,129 @@ +dataset_info = dict( + dataset_name='jhmdb', + paper_info=dict( + author='H. Jhuang and J. Gall and S. Zuffi and ' + 'C. Schmid and M. J. Black', + title='Towards understanding action recognition', + container='International Conf. 
on Computer Vision (ICCV)', + year='2013', + homepage='http://jhmdb.is.tue.mpg.de/dataset', + ), + keypoint_info={ + 0: + dict(name='neck', id=0, color=[255, 128, 0], type='upper', swap=''), + 1: + dict(name='belly', id=1, color=[255, 128, 0], type='upper', swap=''), + 2: + dict(name='head', id=2, color=[255, 128, 0], type='upper', swap=''), + 3: + dict( + name='right_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='left_shoulder'), + 4: + dict( + name='left_shoulder', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 5: + dict( + name='right_hip', + id=5, + color=[0, 255, 0], + type='lower', + swap='left_hip'), + 6: + dict( + name='left_hip', + id=6, + color=[51, 153, 255], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_elbow', + id=7, + color=[51, 153, 255], + type='upper', + swap='left_elbow'), + 8: + dict( + name='left_elbow', + id=8, + color=[51, 153, 255], + type='upper', + swap='right_elbow'), + 9: + dict( + name='right_knee', + id=9, + color=[51, 153, 255], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_knee', + id=10, + color=[255, 128, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='right_wrist', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 12: + dict( + name='left_wrist', + id=12, + color=[255, 128, 0], + type='upper', + swap='right_wrist'), + 13: + dict( + name='right_ankle', + id=13, + color=[0, 255, 0], + type='lower', + swap='left_ankle'), + 14: + dict( + name='left_ankle', + id=14, + color=[0, 255, 0], + type='lower', + swap='right_ankle') + }, + skeleton_info={ + 0: dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_hip', 'belly'), id=2, color=[255, 128, 0]), + 3: dict(link=('belly', 'left_hip'), id=3, color=[0, 255, 0]), + 4: dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_knee', 
'left_ankle'), id=5, color=[0, 255, 0]), + 6: dict(link=('belly', 'neck'), id=6, color=[51, 153, 255]), + 7: dict(link=('neck', 'head'), id=7, color=[51, 153, 255]), + 8: dict(link=('neck', 'right_shoulder'), id=8, color=[255, 128, 0]), + 9: dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('right_elbow', 'right_wrist'), id=10, color=[255, 128, 0]), + 11: dict(link=('neck', 'left_shoulder'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_shoulder', 'left_elbow'), id=12, color=[0, 255, 0]), + 13: dict(link=('left_elbow', 'left_wrist'), id=13, color=[0, 255, 0]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2, 1.5, 1.5, 1.5, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.025, 0.107, 0.025, 0.079, 0.079, 0.107, 0.107, 0.072, 0.072, 0.087, + 0.087, 0.062, 0.062, 0.089, 0.089 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/lapa.py b/modules/rtmpose/configs/_base_/datasets/lapa.py new file mode 100644 index 0000000..56ff1e6 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/lapa.py @@ -0,0 +1,246 @@ +dataset_info = dict( + dataset_name='lapa', + paper_info=dict( + author='Liu, Yinglu and Shi, Hailin and Shen, Hao and Si, ' + 'Yue and Wang, Xiaobo and Mei, Tao', + title='A New Dataset and Boundary-Attention Semantic ' + 'Segmentation for Face Parsing.', + container='Proceedings of the AAAI Conference on ' + 'Artificial Intelligence 2020', + year='2020', + homepage='https://github.com/JDAI-CV/lapa-dataset', + ), + keypoint_info={ + 0: + dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-32'), + 1: + dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-31'), + 2: + dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-30'), + 3: + dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-29'), + 4: + dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-28'), + 5: + dict(name='kpt-5', id=5, color=[255, 0, 0], type='', 
swap='kpt-27'), + 6: + dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-26'), + 7: + dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-25'), + 8: + dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-24'), + 9: + dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-23'), + 10: + dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-22'), + 11: + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-21'), + 12: + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-20'), + 13: + dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-19'), + 14: + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-18'), + 15: + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-17'), + 16: + dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''), + 17: + dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-15'), + 18: + dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-14'), + 19: + dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-13'), + 20: + dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-12'), + 21: + dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap='kpt-11'), + 22: + dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-10'), + 23: + dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-9'), + 24: + dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-8'), + 25: + dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-7'), + 26: + dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-6'), + 27: + dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap='kpt-5'), + 28: + dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap='kpt-4'), + 29: + dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap='kpt-3'), + 30: + dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap='kpt-2'), + 31: + dict(name='kpt-31', id=31, 
color=[255, 0, 0], type='', swap='kpt-1'), + 32: + dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-0'), + 33: + dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap='kpt-46'), + 34: + dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-45'), + 35: + dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap='kpt-44'), + 36: + dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-43'), + 37: + dict(name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-42'), + 38: + dict(name='kpt-38', id=38, color=[255, 0, 0], type='', swap='kpt-50'), + 39: + dict(name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-49'), + 40: + dict(name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-48'), + 41: + dict(name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-47'), + 42: + dict(name='kpt-42', id=42, color=[255, 0, 0], type='', swap='kpt-37'), + 43: + dict(name='kpt-43', id=43, color=[255, 0, 0], type='', swap='kpt-36'), + 44: + dict(name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-35'), + 45: + dict(name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-34'), + 46: + dict(name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-33'), + 47: + dict(name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-41'), + 48: + dict(name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-40'), + 49: + dict(name='kpt-49', id=49, color=[255, 0, 0], type='', swap='kpt-39'), + 50: + dict(name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-38'), + 51: + dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''), + 52: + dict(name='kpt-52', id=52, color=[255, 0, 0], type='', swap=''), + 53: + dict(name='kpt-53', id=53, color=[255, 0, 0], type='', swap=''), + 54: + dict(name='kpt-54', id=54, color=[255, 0, 0], type='', swap=''), + 55: + dict(name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-65'), + 56: + dict(name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-64'), + 57: 
+ dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap='kpt-63'), + 58: + dict(name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-62'), + 59: + dict(name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-61'), + 60: + dict(name='kpt-60', id=60, color=[255, 0, 0], type='', swap=''), + 61: + dict(name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-59'), + 62: + dict(name='kpt-62', id=62, color=[255, 0, 0], type='', swap='kpt-58'), + 63: + dict(name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-57'), + 64: + dict(name='kpt-64', id=64, color=[255, 0, 0], type='', swap='kpt-56'), + 65: + dict(name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-55'), + 66: + dict(name='kpt-66', id=66, color=[255, 0, 0], type='', swap='kpt-79'), + 67: + dict(name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-78'), + 68: + dict(name='kpt-68', id=68, color=[255, 0, 0], type='', swap='kpt-77'), + 69: + dict(name='kpt-69', id=69, color=[255, 0, 0], type='', swap='kpt-76'), + 70: + dict(name='kpt-70', id=70, color=[255, 0, 0], type='', swap='kpt-75'), + 71: + dict(name='kpt-71', id=71, color=[255, 0, 0], type='', swap='kpt-82'), + 72: + dict(name='kpt-72', id=72, color=[255, 0, 0], type='', swap='kpt-81'), + 73: + dict(name='kpt-73', id=73, color=[255, 0, 0], type='', swap='kpt-80'), + 74: + dict(name='kpt-74', id=74, color=[255, 0, 0], type='', swap='kpt-83'), + 75: + dict(name='kpt-75', id=75, color=[255, 0, 0], type='', swap='kpt-70'), + 76: + dict(name='kpt-76', id=76, color=[255, 0, 0], type='', swap='kpt-69'), + 77: + dict(name='kpt-77', id=77, color=[255, 0, 0], type='', swap='kpt-68'), + 78: + dict(name='kpt-78', id=78, color=[255, 0, 0], type='', swap='kpt-67'), + 79: + dict(name='kpt-79', id=79, color=[255, 0, 0], type='', swap='kpt-66'), + 80: + dict(name='kpt-80', id=80, color=[255, 0, 0], type='', swap='kpt-73'), + 81: + dict(name='kpt-81', id=81, color=[255, 0, 0], type='', swap='kpt-72'), + 82: + dict(name='kpt-82', id=82, 
color=[255, 0, 0], type='', swap='kpt-71'), + 83: + dict(name='kpt-83', id=83, color=[255, 0, 0], type='', swap='kpt-74'), + 84: + dict(name='kpt-84', id=84, color=[255, 0, 0], type='', swap='kpt-90'), + 85: + dict(name='kpt-85', id=85, color=[255, 0, 0], type='', swap='kpt-89'), + 86: + dict(name='kpt-86', id=86, color=[255, 0, 0], type='', swap='kpt-88'), + 87: + dict(name='kpt-87', id=87, color=[255, 0, 0], type='', swap=''), + 88: + dict(name='kpt-88', id=88, color=[255, 0, 0], type='', swap='kpt-86'), + 89: + dict(name='kpt-89', id=89, color=[255, 0, 0], type='', swap='kpt-85'), + 90: + dict(name='kpt-90', id=90, color=[255, 0, 0], type='', swap='kpt-84'), + 91: + dict(name='kpt-91', id=91, color=[255, 0, 0], type='', swap='kpt-95'), + 92: + dict(name='kpt-92', id=92, color=[255, 0, 0], type='', swap='kpt-94'), + 93: + dict(name='kpt-93', id=93, color=[255, 0, 0], type='', swap=''), + 94: + dict(name='kpt-94', id=94, color=[255, 0, 0], type='', swap='kpt-92'), + 95: + dict(name='kpt-95', id=95, color=[255, 0, 0], type='', swap='kpt-91'), + 96: + dict(name='kpt-96', id=96, color=[255, 0, 0], type='', swap='kpt-100'), + 97: + dict(name='kpt-97', id=97, color=[255, 0, 0], type='', swap='kpt-99'), + 98: + dict(name='kpt-98', id=98, color=[255, 0, 0], type='', swap=''), + 99: + dict(name='kpt-99', id=99, color=[255, 0, 0], type='', swap='kpt-97'), + 100: + dict( + name='kpt-100', id=100, color=[255, 0, 0], type='', swap='kpt-96'), + 101: + dict( + name='kpt-101', id=101, color=[255, 0, 0], type='', + swap='kpt-103'), + 102: + dict(name='kpt-102', id=102, color=[255, 0, 0], type='', swap=''), + 103: + dict( + name='kpt-103', id=103, color=[255, 0, 0], type='', + swap='kpt-101'), + 104: + dict( + name='kpt-104', id=104, color=[255, 0, 0], type='', + swap='kpt-105'), + 105: + dict( + name='kpt-105', id=105, color=[255, 0, 0], type='', swap='kpt-104') + }, + skeleton_info={}, + joint_weights=[ + 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, + 
0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, + 0.8, 0.8, 0.8, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, + 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, + 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.0, 1.0 + ], + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/locust.py b/modules/rtmpose/configs/_base_/datasets/locust.py new file mode 100644 index 0000000..3a6fafd --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/locust.py @@ -0,0 +1,263 @@ +dataset_info = dict( + dataset_name='locust', + paper_info=dict( + author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and ' + 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and ' + 'Couzin, Iain D', + title='DeepPoseKit, a software toolkit for fast and robust ' + 'animal pose estimation using deep learning', + container='Elife', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='head', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='neck', id=1, color=[255, 255, 255], type='', swap=''), + 2: + dict(name='thorax', id=2, color=[255, 255, 255], type='', swap=''), + 3: + dict(name='abdomen1', id=3, color=[255, 255, 255], type='', swap=''), + 4: + dict(name='abdomen2', id=4, color=[255, 255, 255], type='', swap=''), + 5: + dict( + name='anttipL', + id=5, + color=[255, 255, 255], + type='', + swap='anttipR'), + 6: + dict( + name='antbaseL', + id=6, + color=[255, 255, 255], + type='', + swap='antbaseR'), + 7: + dict(name='eyeL', id=7, color=[255, 255, 255], type='', swap='eyeR'), + 8: + dict( + name='forelegL1', + id=8, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 9: + dict( + name='forelegL2', + id=9, + color=[255, 255, 255], + type='', + 
swap='forelegR2'), + 10: + dict( + name='forelegL3', + id=10, + color=[255, 255, 255], + type='', + swap='forelegR3'), + 11: + dict( + name='forelegL4', + id=11, + color=[255, 255, 255], + type='', + swap='forelegR4'), + 12: + dict( + name='midlegL1', + id=12, + color=[255, 255, 255], + type='', + swap='midlegR1'), + 13: + dict( + name='midlegL2', + id=13, + color=[255, 255, 255], + type='', + swap='midlegR2'), + 14: + dict( + name='midlegL3', + id=14, + color=[255, 255, 255], + type='', + swap='midlegR3'), + 15: + dict( + name='midlegL4', + id=15, + color=[255, 255, 255], + type='', + swap='midlegR4'), + 16: + dict( + name='hindlegL1', + id=16, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 17: + dict( + name='hindlegL2', + id=17, + color=[255, 255, 255], + type='', + swap='hindlegR2'), + 18: + dict( + name='hindlegL3', + id=18, + color=[255, 255, 255], + type='', + swap='hindlegR3'), + 19: + dict( + name='hindlegL4', + id=19, + color=[255, 255, 255], + type='', + swap='hindlegR4'), + 20: + dict( + name='anttipR', + id=20, + color=[255, 255, 255], + type='', + swap='anttipL'), + 21: + dict( + name='antbaseR', + id=21, + color=[255, 255, 255], + type='', + swap='antbaseL'), + 22: + dict(name='eyeR', id=22, color=[255, 255, 255], type='', swap='eyeL'), + 23: + dict( + name='forelegR1', + id=23, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 24: + dict( + name='forelegR2', + id=24, + color=[255, 255, 255], + type='', + swap='forelegL2'), + 25: + dict( + name='forelegR3', + id=25, + color=[255, 255, 255], + type='', + swap='forelegL3'), + 26: + dict( + name='forelegR4', + id=26, + color=[255, 255, 255], + type='', + swap='forelegL4'), + 27: + dict( + name='midlegR1', + id=27, + color=[255, 255, 255], + type='', + swap='midlegL1'), + 28: + dict( + name='midlegR2', + id=28, + color=[255, 255, 255], + type='', + swap='midlegL2'), + 29: + dict( + name='midlegR3', + id=29, + color=[255, 255, 255], + type='', + swap='midlegL3'), + 30: + dict( + 
name='midlegR4', + id=30, + color=[255, 255, 255], + type='', + swap='midlegL4'), + 31: + dict( + name='hindlegR1', + id=31, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 32: + dict( + name='hindlegR2', + id=32, + color=[255, 255, 255], + type='', + swap='hindlegL2'), + 33: + dict( + name='hindlegR3', + id=33, + color=[255, 255, 255], + type='', + swap='hindlegL3'), + 34: + dict( + name='hindlegR4', + id=34, + color=[255, 255, 255], + type='', + swap='hindlegL4') + }, + skeleton_info={ + 0: dict(link=('neck', 'head'), id=0, color=[255, 255, 255]), + 1: dict(link=('thorax', 'neck'), id=1, color=[255, 255, 255]), + 2: dict(link=('abdomen1', 'thorax'), id=2, color=[255, 255, 255]), + 3: dict(link=('abdomen2', 'abdomen1'), id=3, color=[255, 255, 255]), + 4: dict(link=('antbaseL', 'anttipL'), id=4, color=[255, 255, 255]), + 5: dict(link=('eyeL', 'antbaseL'), id=5, color=[255, 255, 255]), + 6: dict(link=('forelegL2', 'forelegL1'), id=6, color=[255, 255, 255]), + 7: dict(link=('forelegL3', 'forelegL2'), id=7, color=[255, 255, 255]), + 8: dict(link=('forelegL4', 'forelegL3'), id=8, color=[255, 255, 255]), + 9: dict(link=('midlegL2', 'midlegL1'), id=9, color=[255, 255, 255]), + 10: dict(link=('midlegL3', 'midlegL2'), id=10, color=[255, 255, 255]), + 11: dict(link=('midlegL4', 'midlegL3'), id=11, color=[255, 255, 255]), + 12: + dict(link=('hindlegL2', 'hindlegL1'), id=12, color=[255, 255, 255]), + 13: + dict(link=('hindlegL3', 'hindlegL2'), id=13, color=[255, 255, 255]), + 14: + dict(link=('hindlegL4', 'hindlegL3'), id=14, color=[255, 255, 255]), + 15: dict(link=('antbaseR', 'anttipR'), id=15, color=[255, 255, 255]), + 16: dict(link=('eyeR', 'antbaseR'), id=16, color=[255, 255, 255]), + 17: + dict(link=('forelegR2', 'forelegR1'), id=17, color=[255, 255, 255]), + 18: + dict(link=('forelegR3', 'forelegR2'), id=18, color=[255, 255, 255]), + 19: + dict(link=('forelegR4', 'forelegR3'), id=19, color=[255, 255, 255]), + 20: dict(link=('midlegR2', 'midlegR1'), id=20, 
color=[255, 255, 255]), + 21: dict(link=('midlegR3', 'midlegR2'), id=21, color=[255, 255, 255]), + 22: dict(link=('midlegR4', 'midlegR3'), id=22, color=[255, 255, 255]), + 23: + dict(link=('hindlegR2', 'hindlegR1'), id=23, color=[255, 255, 255]), + 24: + dict(link=('hindlegR3', 'hindlegR2'), id=24, color=[255, 255, 255]), + 25: + dict(link=('hindlegR4', 'hindlegR3'), id=25, color=[255, 255, 255]) + }, + joint_weights=[1.] * 35, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/macaque.py b/modules/rtmpose/configs/_base_/datasets/macaque.py new file mode 100644 index 0000000..926ca30 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/macaque.py @@ -0,0 +1,183 @@ +dataset_info = dict( + dataset_name='macaque', + paper_info=dict( + author='Labuguen, Rollyn and Matsumoto, Jumpei and ' + 'Negrete, Salvador and Nishimaru, Hiroshi and ' + 'Nishijo, Hisao and Takada, Masahiko and ' + 'Go, Yasuhiro and Inoue, Ken-ichi and Shibata, Tomohiro', + title='MacaquePose: A novel "in the wild" macaque monkey pose dataset ' + 'for markerless motion capture', + container='bioRxiv', + year='2020', + homepage='http://www.pri.kyoto-u.ac.jp/datasets/' + 'macaquepose/index.html', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', 
+ swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + 
dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/mhp.py b/modules/rtmpose/configs/_base_/datasets/mhp.py new file mode 100644 index 0000000..9c8c03c --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/mhp.py @@ -0,0 +1,156 @@ +dataset_info = dict( + dataset_name='mhp', + paper_info=dict( + author='Zhao, Jian and Li, Jianshu and Cheng, Yu and ' + 'Sim, Terence and Yan, Shuicheng and Feng, Jiashi', + title='Understanding humans in crowded scenes: ' + 'Deep nested adversarial learning and a ' + 'new benchmark for multi-human parsing', + container='Proceedings of the 26th ACM ' + 'international conference on Multimedia', + year='2018', + homepage='https://lv-mhp.github.io/dataset', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + 
swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + 
dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/mpi_inf_3dhp.py b/modules/rtmpose/configs/_base_/datasets/mpi_inf_3dhp.py new file mode 100644 index 0000000..ed088c2 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/mpi_inf_3dhp.py @@ -0,0 +1,132 @@ +dataset_info = dict( + dataset_name='mpi_inf_3dhp', + paper_info=dict( + author='ehta, Dushyant and Rhodin, Helge and Casas, Dan and ' + 'Fua, Pascal and Sotnychenko, Oleksandr and Xu, Weipeng and ' + 'Theobalt, Christian', + title='Monocular 3D Human Pose Estimation In The Wild Using Improved ' + 'CNN Supervision', + container='2017 international conference on 3D vision (3DV)', + year='2017', + homepage='http://gvv.mpi-inf.mpg.de/3dhp-dataset', + ), + keypoint_info={ + 0: + dict( + name='head_top', id=0, color=[51, 153, 255], type='upper', + swap=''), + 1: + dict(name='neck', id=1, color=[51, 153, 255], type='upper', swap=''), + 2: + dict( + name='right_shoulder', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='right_wrist', + id=4, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_elbow', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 7: + dict( + name='left_wrist', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 8: + dict( + name='right_hip', + id=8, + color=[255, 128, 
0], + type='lower', + swap='left_hip'), + 9: + dict( + name='right_knee', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='right_ankle', + id=10, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='left_knee', + id=12, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 13: + dict( + name='left_ankle', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 14: + dict(name='root', id=14, color=[51, 153, 255], type='lower', swap=''), + 15: + dict(name='spine', id=15, color=[51, 153, 255], type='upper', swap=''), + 16: + dict(name='head', id=16, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: dict(link=('neck', 'right_shoulder'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_shoulder', 'right_elbow'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_elbow', 'right_wrist'), id=2, color=[255, 128, 0]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[0, 255, 0]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('root', 'right_hip'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_hip', 'right_knee'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_knee', 'right_ankle'), id=8, color=[255, 128, 0]), + 9: dict(link=('root', 'left_hip'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_hip', 'left_knee'), id=10, color=[0, 255, 0]), + 11: dict(link=('left_knee', 'left_ankle'), id=11, color=[0, 255, 0]), + 12: dict(link=('head_top', 'head'), id=12, color=[51, 153, 255]), + 13: dict(link=('head', 'neck'), id=13, color=[51, 153, 255]), + 14: dict(link=('neck', 'spine'), id=14, color=[51, 153, 255]), + 15: dict(link=('spine', 'root'), id=15, color=[51, 153, 255]) + }, + joint_weights=[1.] 
* 17, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/mpii.py b/modules/rtmpose/configs/_base_/datasets/mpii.py new file mode 100644 index 0000000..2723bae --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/mpii.py @@ -0,0 +1,155 @@ +dataset_info = dict( + dataset_name='mpii', + paper_info=dict( + author='Mykhaylo Andriluka and Leonid Pishchulin and ' + 'Peter Gehler and Schiele, Bernt', + title='2D Human Pose Estimation: New Benchmark and ' + 'State of the Art Analysis', + container='IEEE Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://human-pose.mpi-inf.mpg.de/', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + 
color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. 
+ sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/mpii_trb.py b/modules/rtmpose/configs/_base_/datasets/mpii_trb.py new file mode 100644 index 0000000..ddb7e9e --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/mpii_trb.py @@ -0,0 +1,380 @@ +dataset_info = dict( + dataset_name='mpii_trb', + paper_info=dict( + author='Duan, Haodong and Lin, Kwan-Yee and Jin, Sheng and ' + 'Liu, Wentao and Qian, Chen and Ouyang, Wanli', + title='TRB: A Novel Triplet Representation for ' + 'Understanding 2D Human Body', + container='Proceedings of the IEEE International ' + 'Conference on Computer Vision', + year='2019', + homepage='https://github.com/kennymckormick/' + 'Triplet-Representation-of-human-Body', + ), + keypoint_info={ + 0: + dict( + name='left_shoulder', + id=0, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 1: + dict( + name='right_shoulder', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 2: + dict( + name='left_elbow', + id=2, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='left_wrist', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 5: + dict( + name='right_wrist', + id=5, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_hip', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='left_knee', + id=8, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 9: + dict( + name='right_knee', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_ankle', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), 
+ 11: + dict( + name='right_ankle', + id=11, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 12: + dict(name='head', id=12, color=[51, 153, 255], type='upper', swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap=''), + 14: + dict( + name='right_neck', + id=14, + color=[255, 255, 255], + type='upper', + swap='left_neck'), + 15: + dict( + name='left_neck', + id=15, + color=[255, 255, 255], + type='upper', + swap='right_neck'), + 16: + dict( + name='medial_right_shoulder', + id=16, + color=[255, 255, 255], + type='upper', + swap='medial_left_shoulder'), + 17: + dict( + name='lateral_right_shoulder', + id=17, + color=[255, 255, 255], + type='upper', + swap='lateral_left_shoulder'), + 18: + dict( + name='medial_right_bow', + id=18, + color=[255, 255, 255], + type='upper', + swap='medial_left_bow'), + 19: + dict( + name='lateral_right_bow', + id=19, + color=[255, 255, 255], + type='upper', + swap='lateral_left_bow'), + 20: + dict( + name='medial_right_wrist', + id=20, + color=[255, 255, 255], + type='upper', + swap='medial_left_wrist'), + 21: + dict( + name='lateral_right_wrist', + id=21, + color=[255, 255, 255], + type='upper', + swap='lateral_left_wrist'), + 22: + dict( + name='medial_left_shoulder', + id=22, + color=[255, 255, 255], + type='upper', + swap='medial_right_shoulder'), + 23: + dict( + name='lateral_left_shoulder', + id=23, + color=[255, 255, 255], + type='upper', + swap='lateral_right_shoulder'), + 24: + dict( + name='medial_left_bow', + id=24, + color=[255, 255, 255], + type='upper', + swap='medial_right_bow'), + 25: + dict( + name='lateral_left_bow', + id=25, + color=[255, 255, 255], + type='upper', + swap='lateral_right_bow'), + 26: + dict( + name='medial_left_wrist', + id=26, + color=[255, 255, 255], + type='upper', + swap='medial_right_wrist'), + 27: + dict( + name='lateral_left_wrist', + id=27, + color=[255, 255, 255], + type='upper', + swap='lateral_right_wrist'), + 28: + dict( + 
name='medial_right_hip', + id=28, + color=[255, 255, 255], + type='lower', + swap='medial_left_hip'), + 29: + dict( + name='lateral_right_hip', + id=29, + color=[255, 255, 255], + type='lower', + swap='lateral_left_hip'), + 30: + dict( + name='medial_right_knee', + id=30, + color=[255, 255, 255], + type='lower', + swap='medial_left_knee'), + 31: + dict( + name='lateral_right_knee', + id=31, + color=[255, 255, 255], + type='lower', + swap='lateral_left_knee'), + 32: + dict( + name='medial_right_ankle', + id=32, + color=[255, 255, 255], + type='lower', + swap='medial_left_ankle'), + 33: + dict( + name='lateral_right_ankle', + id=33, + color=[255, 255, 255], + type='lower', + swap='lateral_left_ankle'), + 34: + dict( + name='medial_left_hip', + id=34, + color=[255, 255, 255], + type='lower', + swap='medial_right_hip'), + 35: + dict( + name='lateral_left_hip', + id=35, + color=[255, 255, 255], + type='lower', + swap='lateral_right_hip'), + 36: + dict( + name='medial_left_knee', + id=36, + color=[255, 255, 255], + type='lower', + swap='medial_right_knee'), + 37: + dict( + name='lateral_left_knee', + id=37, + color=[255, 255, 255], + type='lower', + swap='lateral_right_knee'), + 38: + dict( + name='medial_left_ankle', + id=38, + color=[255, 255, 255], + type='lower', + swap='medial_right_ankle'), + 39: + dict( + name='lateral_left_ankle', + id=39, + color=[255, 255, 255], + type='lower', + swap='lateral_right_ankle'), + }, + skeleton_info={ + 0: + dict(link=('head', 'neck'), id=0, color=[51, 153, 255]), + 1: + dict(link=('neck', 'left_shoulder'), id=1, color=[51, 153, 255]), + 2: + dict(link=('neck', 'right_shoulder'), id=2, color=[51, 153, 255]), + 3: + dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]), + 4: + dict( + link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]), + 5: + dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: + dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]), + 7: + 
dict(link=('left_shoulder', 'left_hip'), id=7, color=[51, 153, 255]), + 8: + dict(link=('right_shoulder', 'right_hip'), id=8, color=[51, 153, 255]), + 9: + dict(link=('left_hip', 'right_hip'), id=9, color=[51, 153, 255]), + 10: + dict(link=('left_hip', 'left_knee'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_hip', 'right_knee'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_knee', 'left_ankle'), id=12, color=[0, 255, 0]), + 13: + dict(link=('right_knee', 'right_ankle'), id=13, color=[255, 128, 0]), + 14: + dict(link=('right_neck', 'left_neck'), id=14, color=[255, 255, 255]), + 15: + dict( + link=('medial_right_shoulder', 'lateral_right_shoulder'), + id=15, + color=[255, 255, 255]), + 16: + dict( + link=('medial_right_bow', 'lateral_right_bow'), + id=16, + color=[255, 255, 255]), + 17: + dict( + link=('medial_right_wrist', 'lateral_right_wrist'), + id=17, + color=[255, 255, 255]), + 18: + dict( + link=('medial_left_shoulder', 'lateral_left_shoulder'), + id=18, + color=[255, 255, 255]), + 19: + dict( + link=('medial_left_bow', 'lateral_left_bow'), + id=19, + color=[255, 255, 255]), + 20: + dict( + link=('medial_left_wrist', 'lateral_left_wrist'), + id=20, + color=[255, 255, 255]), + 21: + dict( + link=('medial_right_hip', 'lateral_right_hip'), + id=21, + color=[255, 255, 255]), + 22: + dict( + link=('medial_right_knee', 'lateral_right_knee'), + id=22, + color=[255, 255, 255]), + 23: + dict( + link=('medial_right_ankle', 'lateral_right_ankle'), + id=23, + color=[255, 255, 255]), + 24: + dict( + link=('medial_left_hip', 'lateral_left_hip'), + id=24, + color=[255, 255, 255]), + 25: + dict( + link=('medial_left_knee', 'lateral_left_knee'), + id=25, + color=[255, 255, 255]), + 26: + dict( + link=('medial_left_ankle', 'lateral_left_ankle'), + id=26, + color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 40, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/ochuman.py b/modules/rtmpose/configs/_base_/datasets/ochuman.py new file mode 100644 index 0000000..e6e86ba --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/ochuman.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='ochuman', + paper_info=dict( + author='Zhang, Song-Hai and Li, Ruilong and Dong, Xin and ' + 'Rosin, Paul and Cai, Zixi and Han, Xi and ' + 'Yang, Dingcheng and Huang, Haozhi and Hu, Shi-Min', + title='Pose2seg: Detection free human instance segmentation', + container='Proceedings of the IEEE conference on computer ' + 'vision and pattern recognition', + year='2019', + homepage='https://github.com/liruilong940607/OCHumanApi', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + 
name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ 
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/onehand10k.py b/modules/rtmpose/configs/_base_/datasets/onehand10k.py new file mode 100644 index 0000000..833f186 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/onehand10k.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='onehand10k', + paper_info=dict( + author='Wang, Yangang and Peng, Cong and Liu, Yebin', + title='Mask-pose cascaded cnn for 2d hand pose estimation ' + 'from single color image', + container='IEEE Transactions on Circuits and Systems ' + 'for Video Technology', + year='2018', + homepage='https://www.yangangwang.com/papers/WANG-MCC-2018-10.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + 
name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: 
+ dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/panoptic_body3d.py b/modules/rtmpose/configs/_base_/datasets/panoptic_body3d.py new file mode 100644 index 0000000..6623409 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/panoptic_body3d.py @@ -0,0 +1,160 @@ +dataset_info = dict( + dataset_name='panoptic_pose_3d', + paper_info=dict( + author='Joo, Hanbyul and Simon, Tomas and Li, Xulong' + 'and Liu, Hao and Tan, Lei and Gui, Lin and Banerjee, Sean' + 'and Godisart, Timothy and Nabbe, Bart and Matthews, Iain' + 'and Kanade, Takeo and Nobuhara, Shohei and Sheikh, Yaser', + title='Panoptic Studio: A Massively Multiview System ' + 'for Interaction Motion Capture', + container='IEEE Transactions on Pattern Analysis' + ' and Machine Intelligence', + year='2017', + homepage='http://domedb.perception.cs.cmu.edu', + ), + keypoint_info={ + 0: + dict(name='neck', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict(name='nose', id=1, color=[51, 153, 255], type='upper', swap=''), + 2: + dict(name='mid_hip', id=2, color=[0, 255, 0], type='lower', swap=''), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='left_knee', + id=7, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 8: + dict( + name='left_ankle', + id=8, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 9: + 
dict( + name='right_shoulder', + id=9, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 10: + dict( + name='right_elbow', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 11: + dict( + name='right_wrist', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='right_knee', + id=13, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 14: + dict( + name='right_ankle', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 15: + dict( + name='left_eye', + id=15, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 16: + dict( + name='left_ear', + id=16, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 17: + dict( + name='right_eye', + id=17, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 18: + dict( + name='right_ear', + id=18, + color=[51, 153, 255], + type='upper', + swap='left_ear') + }, + skeleton_info={ + 0: dict(link=('nose', 'neck'), id=0, color=[51, 153, 255]), + 1: dict(link=('neck', 'left_shoulder'), id=1, color=[0, 255, 0]), + 2: dict(link=('neck', 'right_shoulder'), id=2, color=[255, 128, 0]), + 3: dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]), + 4: dict( + link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: + dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]), + 7: dict(link=('left_ankle', 'left_knee'), id=7, color=[0, 255, 0]), + 8: dict(link=('left_knee', 'left_hip'), id=8, color=[0, 255, 0]), + 9: dict(link=('right_ankle', 'right_knee'), id=9, color=[255, 128, 0]), + 10: dict(link=('right_knee', 'right_hip'), id=10, color=[255, 128, 0]), + 11: dict(link=('mid_hip', 'left_hip'), id=11, color=[0, 255, 0]), + 12: dict(link=('mid_hip', 'right_hip'), id=12, color=[255, 128, 0]), + 13: 
dict(link=('mid_hip', 'neck'), id=13, color=[51, 153, 255]), + }, + joint_weights=[ + 1.0, 1.0, 1.0, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2, + 1.5, 1.0, 1.0, 1.0, 1.0 + ], + sigmas=[ + 0.026, 0.026, 0.107, 0.079, 0.072, 0.062, 0.107, 0.087, 0.089, 0.079, + 0.072, 0.062, 0.107, 0.087, 0.089, 0.025, 0.035, 0.025, 0.035 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/panoptic_hand2d.py b/modules/rtmpose/configs/_base_/datasets/panoptic_hand2d.py new file mode 100644 index 0000000..5d01b9a --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/panoptic_hand2d.py @@ -0,0 +1,143 @@ +dataset_info = dict( + dataset_name='panoptic_hand2d', + paper_info=dict( + author='Simon, Tomas and Joo, Hanbyul and ' + 'Matthews, Iain and Sheikh, Yaser', + title='Hand keypoint detection in single images using ' + 'multiview bootstrapping', + container='Proceedings of the IEEE conference on ' + 'Computer Vision and Pattern Recognition', + year='2017', + homepage='http://domedb.perception.cs.cmu.edu/handdb.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + 
color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 
15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/posetrack18.py b/modules/rtmpose/configs/_base_/datasets/posetrack18.py new file mode 100644 index 0000000..18e1891 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/posetrack18.py @@ -0,0 +1,176 @@ +dataset_info = dict( + dataset_name='posetrack18', + paper_info=dict( + author='Andriluka, Mykhaylo and Iqbal, Umar and ' + 'Insafutdinov, Eldar and Pishchulin, Leonid and ' + 'Milan, Anton and Gall, Juergen and Schiele, Bernt', + title='Posetrack: A benchmark for human pose estimation and tracking', + container='Proceedings of the IEEE Conference on ' + 'Computer Vision and Pattern Recognition', + year='2018', + homepage='https://posetrack.net/users/download.php', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='head_bottom', + id=1, + color=[51, 153, 255], + type='upper', + swap=''), + 2: + dict( + name='head_top', id=2, color=[51, 153, 255], type='upper', + swap=''), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( 
+ name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('nose', 'head_bottom'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'head_top'), id=13, color=[51, 153, 
255]), + 14: + dict( + link=('head_bottom', 'left_shoulder'), id=14, color=[51, 153, + 255]), + 15: + dict( + link=('head_bottom', 'right_shoulder'), + id=15, + color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/rhd2d.py b/modules/rtmpose/configs/_base_/datasets/rhd2d.py new file mode 100644 index 0000000..8829b15 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/rhd2d.py @@ -0,0 +1,151 @@ +dataset_info = dict( + dataset_name='rhd2d', + paper_info=dict( + author='Christian Zimmermann and Thomas Brox', + title='Learning to Estimate 3D Hand Pose from Single RGB Images', + container='arXiv', + year='2017', + homepage='https://lmb.informatik.uni-freiburg.de/resources/' + 'datasets/RenderedHandposeDataset.en.html', + ), + # In RHD, 1-4: left thumb [tip to palm], which means the finger is from + # tip to palm, so as other fingers. Please refer to + # `https://lmb.informatik.uni-freiburg.de/resources/datasets/ + # RenderedHandpose/README` for details of keypoint definition. + # But in COCO-WholeBody-Hand, FreiHand, CMU Panoptic HandDB, it is in + # inverse order. Pay attention to this if you want to combine RHD with + # other hand datasets to train a single model. + # Also, note that 'keypoint_info' will not directly affect the order of + # the keypoint in the dataset. It is mostly for visualization & storing + # information about flip_pairs. 
+ keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb4', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb3', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb2', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb1', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger4', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger3', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger2', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger1', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger4', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger3', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger2', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger1', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger4', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger3', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger2', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger1', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger4', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger3', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger2', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger1', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + 
dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/shelf.py b/modules/rtmpose/configs/_base_/datasets/shelf.py new file mode 100644 index 0000000..6a7984b --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/shelf.py @@ -0,0 +1,151 @@ +dataset_info = dict( + dataset_name='shelf', + paper_info=dict( + author='Belagiannis, Vasileios and Amin, Sikandar and Andriluka, ' + 'Mykhaylo and Schiele, Bernt and Navab, Nassir and Ilic, Slobodan', + title='3D Pictorial Structures for Multiple Human Pose Estimation', + container='IEEE Computer Society Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://campar.in.tum.de/Chair/MultiHumanPose', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict( + name='right_wrist', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 7: + dict( + name='right_elbow', + id=7, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 8: + dict( + name='right_shoulder', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 9: + dict( + name='left_shoulder', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 10: + dict( + name='left_elbow', + id=10, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 11: + dict( + name='left_wrist', + id=11, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 12: + dict( + 
name='bottom_head', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict( + name='top_head', + id=13, + color=[51, 153, 255], + type='upper', + swap=''), + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('left_hip', 'left_knee'), id=2, color=[0, 255, 0]), + 3: + dict(link=('left_knee', 'left_ankle'), id=3, color=[0, 255, 0]), + 4: + dict(link=('right_hip', 'left_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('right_wrist', 'right_elbow'), id=5, color=[255, 128, 0]), + 6: + dict( + link=('right_elbow', 'right_shoulder'), id=6, color=[255, 128, 0]), + 7: + dict(link=('left_shoulder', 'left_elbow'), id=7, color=[0, 255, 0]), + 8: + dict(link=('left_elbow', 'left_wrist'), id=8, color=[0, 255, 0]), + 9: + dict(link=('right_hip', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_hip', 'left_shoulder'), id=10, color=[0, 255, 0]), + 11: + dict( + link=('right_shoulder', 'bottom_head'), id=11, color=[255, 128, + 0]), + 12: + dict(link=('left_shoulder', 'bottom_head'), id=12, color=[0, 255, 0]), + 13: + dict(link=('bottom_head', 'top_head'), id=13, color=[51, 153, 255]), + }, + joint_weights=[ + 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.0, 1.0 + ], + sigmas=[ + 0.089, 0.087, 0.107, 0.107, 0.087, 0.089, 0.062, 0.072, 0.079, 0.079, + 0.072, 0.062, 0.026, 0.026 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/ubody2d.py b/modules/rtmpose/configs/_base_/datasets/ubody2d.py new file mode 100644 index 0000000..8e63299 --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/ubody2d.py @@ -0,0 +1,1153 @@ +dataset_info = dict( + dataset_name='ubody2d', + paper_info=dict( + author='Jing Lin, Ailing Zeng, Haoqian Wang, Lei Zhang, Yu Li', + title='One-Stage 3D Whole-Body Mesh Recovery with Component Aware' + 'Transformer', + container='IEEE Computer Society 
Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2023', + homepage='https://github.com/IDEA-Research/OSX', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_big_toe', + id=17, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 
18: + dict( + name='left_small_toe', + id=18, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 19: + dict( + name='left_heel', + id=19, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 20: + dict( + name='right_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 21: + dict( + name='right_small_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 22: + dict( + name='right_heel', + id=22, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 23: + dict( + name='face-0', + id=23, + color=[255, 255, 255], + type='', + swap='face-16'), + 24: + dict( + name='face-1', + id=24, + color=[255, 255, 255], + type='', + swap='face-15'), + 25: + dict( + name='face-2', + id=25, + color=[255, 255, 255], + type='', + swap='face-14'), + 26: + dict( + name='face-3', + id=26, + color=[255, 255, 255], + type='', + swap='face-13'), + 27: + dict( + name='face-4', + id=27, + color=[255, 255, 255], + type='', + swap='face-12'), + 28: + dict( + name='face-5', + id=28, + color=[255, 255, 255], + type='', + swap='face-11'), + 29: + dict( + name='face-6', + id=29, + color=[255, 255, 255], + type='', + swap='face-10'), + 30: + dict( + name='face-7', + id=30, + color=[255, 255, 255], + type='', + swap='face-9'), + 31: + dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + 32: + dict( + name='face-9', + id=32, + color=[255, 255, 255], + type='', + swap='face-7'), + 33: + dict( + name='face-10', + id=33, + color=[255, 255, 255], + type='', + swap='face-6'), + 34: + dict( + name='face-11', + id=34, + color=[255, 255, 255], + type='', + swap='face-5'), + 35: + dict( + name='face-12', + id=35, + color=[255, 255, 255], + type='', + swap='face-4'), + 36: + dict( + name='face-13', + id=36, + color=[255, 255, 255], + type='', + swap='face-3'), + 37: + dict( + name='face-14', + id=37, + color=[255, 255, 255], + type='', + swap='face-2'), + 38: + dict( + name='face-15', + id=38, + 
color=[255, 255, 255], + type='', + swap='face-1'), + 39: + dict( + name='face-16', + id=39, + color=[255, 255, 255], + type='', + swap='face-0'), + 40: + dict( + name='face-17', + id=40, + color=[255, 255, 255], + type='', + swap='face-26'), + 41: + dict( + name='face-18', + id=41, + color=[255, 255, 255], + type='', + swap='face-25'), + 42: + dict( + name='face-19', + id=42, + color=[255, 255, 255], + type='', + swap='face-24'), + 43: + dict( + name='face-20', + id=43, + color=[255, 255, 255], + type='', + swap='face-23'), + 44: + dict( + name='face-21', + id=44, + color=[255, 255, 255], + type='', + swap='face-22'), + 45: + dict( + name='face-22', + id=45, + color=[255, 255, 255], + type='', + swap='face-21'), + 46: + dict( + name='face-23', + id=46, + color=[255, 255, 255], + type='', + swap='face-20'), + 47: + dict( + name='face-24', + id=47, + color=[255, 255, 255], + type='', + swap='face-19'), + 48: + dict( + name='face-25', + id=48, + color=[255, 255, 255], + type='', + swap='face-18'), + 49: + dict( + name='face-26', + id=49, + color=[255, 255, 255], + type='', + swap='face-17'), + 50: + dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + 51: + dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict( + name='face-31', + id=54, + color=[255, 255, 255], + type='', + swap='face-35'), + 55: + dict( + name='face-32', + id=55, + color=[255, 255, 255], + type='', + swap='face-34'), + 56: + dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-34', + id=57, + color=[255, 255, 255], + type='', + swap='face-32'), + 58: + dict( + name='face-35', + id=58, + color=[255, 255, 255], + type='', + swap='face-31'), + 59: + dict( + name='face-36', + id=59, + color=[255, 255, 255], + type='', + swap='face-45'), + 60: + dict( + 
name='face-37', + id=60, + color=[255, 255, 255], + type='', + swap='face-44'), + 61: + dict( + name='face-38', + id=61, + color=[255, 255, 255], + type='', + swap='face-43'), + 62: + dict( + name='face-39', + id=62, + color=[255, 255, 255], + type='', + swap='face-42'), + 63: + dict( + name='face-40', + id=63, + color=[255, 255, 255], + type='', + swap='face-47'), + 64: + dict( + name='face-41', + id=64, + color=[255, 255, 255], + type='', + swap='face-46'), + 65: + dict( + name='face-42', + id=65, + color=[255, 255, 255], + type='', + swap='face-39'), + 66: + dict( + name='face-43', + id=66, + color=[255, 255, 255], + type='', + swap='face-38'), + 67: + dict( + name='face-44', + id=67, + color=[255, 255, 255], + type='', + swap='face-37'), + 68: + dict( + name='face-45', + id=68, + color=[255, 255, 255], + type='', + swap='face-36'), + 69: + dict( + name='face-46', + id=69, + color=[255, 255, 255], + type='', + swap='face-41'), + 70: + dict( + name='face-47', + id=70, + color=[255, 255, 255], + type='', + swap='face-40'), + 71: + dict( + name='face-48', + id=71, + color=[255, 255, 255], + type='', + swap='face-54'), + 72: + dict( + name='face-49', + id=72, + color=[255, 255, 255], + type='', + swap='face-53'), + 73: + dict( + name='face-50', + id=73, + color=[255, 255, 255], + type='', + swap='face-52'), + 74: + dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + 75: + dict( + name='face-52', + id=75, + color=[255, 255, 255], + type='', + swap='face-50'), + 76: + dict( + name='face-53', + id=76, + color=[255, 255, 255], + type='', + swap='face-49'), + 77: + dict( + name='face-54', + id=77, + color=[255, 255, 255], + type='', + swap='face-48'), + 78: + dict( + name='face-55', + id=78, + color=[255, 255, 255], + type='', + swap='face-59'), + 79: + dict( + name='face-56', + id=79, + color=[255, 255, 255], + type='', + swap='face-58'), + 80: + dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict( + name='face-58', + 
id=81, + color=[255, 255, 255], + type='', + swap='face-56'), + 82: + dict( + name='face-59', + id=82, + color=[255, 255, 255], + type='', + swap='face-55'), + 83: + dict( + name='face-60', + id=83, + color=[255, 255, 255], + type='', + swap='face-64'), + 84: + dict( + name='face-61', + id=84, + color=[255, 255, 255], + type='', + swap='face-63'), + 85: + dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='face-63', + id=86, + color=[255, 255, 255], + type='', + swap='face-61'), + 87: + dict( + name='face-64', + id=87, + color=[255, 255, 255], + type='', + swap='face-60'), + 88: + dict( + name='face-65', + id=88, + color=[255, 255, 255], + type='', + swap='face-67'), + 89: + dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + 90: + dict( + name='face-67', + id=90, + color=[255, 255, 255], + type='', + swap='face-65'), + 91: + dict( + name='left_hand_root', + id=91, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 92: + dict( + name='left_thumb1', + id=92, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 93: + dict( + name='left_thumb2', + id=93, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 94: + dict( + name='left_thumb3', + id=94, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 95: + dict( + name='left_thumb4', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 96: + dict( + name='left_forefinger1', + id=96, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 97: + dict( + name='left_forefinger2', + id=97, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 98: + dict( + name='left_forefinger3', + id=98, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 99: + dict( + name='left_forefinger4', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 100: + dict( + name='left_middle_finger1', + id=100, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 
101: + dict( + name='left_middle_finger2', + id=101, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 102: + dict( + name='left_middle_finger3', + id=102, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 103: + dict( + name='left_middle_finger4', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 104: + dict( + name='left_ring_finger1', + id=104, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 105: + dict( + name='left_ring_finger2', + id=105, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 106: + dict( + name='left_ring_finger3', + id=106, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 107: + dict( + name='left_ring_finger4', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 108: + dict( + name='left_pinky_finger1', + id=108, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 109: + dict( + name='left_pinky_finger2', + id=109, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 110: + dict( + name='left_pinky_finger3', + id=110, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 111: + dict( + name='left_pinky_finger4', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 112: + dict( + name='right_hand_root', + id=112, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 113: + dict( + name='right_thumb1', + id=113, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 114: + dict( + name='right_thumb2', + id=114, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 115: + dict( + name='right_thumb3', + id=115, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 116: + dict( + name='right_thumb4', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 117: + dict( + name='right_forefinger1', + id=117, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 118: + dict( + name='right_forefinger2', + 
id=118, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 119: + dict( + name='right_forefinger3', + id=119, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 120: + dict( + name='right_forefinger4', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 121: + dict( + name='right_middle_finger1', + id=121, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 122: + dict( + name='right_middle_finger2', + id=122, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 123: + dict( + name='right_middle_finger3', + id=123, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 124: + dict( + name='right_middle_finger4', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 125: + dict( + name='right_ring_finger1', + id=125, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 126: + dict( + name='right_ring_finger2', + id=126, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 127: + dict( + name='right_ring_finger3', + id=127, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 128: + dict( + name='right_ring_finger4', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 129: + dict( + name='right_pinky_finger1', + id=129, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 130: + dict( + name='right_pinky_finger2', + id=130, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 131: + dict( + name='right_pinky_finger3', + id=131, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 132: + dict( + name='right_pinky_finger4', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + 
dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]), + 22: + dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 
'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=37, + color=[255, 51, 51]), + 38: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_hand_root', 'left_pinky_finger1'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_hand_root', 'right_thumb1'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + 
link=('right_hand_root', 'right_forefinger1'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_hand_root', 'right_ring_finger1'), + id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=61, + color=[0, 255, 0]), + 62: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 133, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066, + 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, + 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, + 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, + 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, + 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, + 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, + 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, + 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, + 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, + 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031 + ]) diff --git a/modules/rtmpose/configs/_base_/datasets/ubody3d.py b/modules/rtmpose/configs/_base_/datasets/ubody3d.py new file mode 100644 index 0000000..313b7df --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/ubody3d.py @@ -0,0 +1,958 @@ +dataset_info = dict( + dataset_name='ubody3d', + paper_info=dict( + author='Jing Lin, Ailing Zeng, Haoqian Wang, Lei Zhang, Yu Li', + title='One-Stage 3D Whole-Body Mesh Recovery with Component Aware' + 'Transformer', + container='IEEE Computer Society Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2023', + homepage='https://github.com/IDEA-Research/OSX', + ), + keypoint_info={ + 0: + dict(name='Pelvis', id=0, color=[0, 255, 0], type='', swap=''), + 1: + dict( + name='L_Hip', id=1, color=[0, 255, 0], type='lower', swap='R_Hip'), + 2: + dict( + name='R_Hip', id=2, color=[0, 255, 0], type='lower', swap='L_Hip'), + 3: + dict( + 
name='L_Knee', + id=3, + color=[0, 255, 0], + type='lower', + swap='R_Knee'), + 4: + dict( + name='R_Knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 5: + dict( + name='L_Ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='R_Ankle'), + 6: + dict( + name='R_Ankle', + id=6, + color=[0, 255, 0], + type='lower', + swap='L_Ankle'), + 7: + dict(name='Neck', id=7, color=[0, 255, 0], type='upper', swap=''), + 8: + dict( + name='L_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='R_Shoulder'), + 9: + dict( + name='R_Shoulder', + id=9, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 10: + dict( + name='L_Elbow', + id=10, + color=[0, 255, 0], + type='upper', + swap='R_Elbow'), + 11: + dict( + name='R_Elbow', + id=11, + color=[0, 255, 0], + type='upper', + swap='L_Elbow'), + 12: + dict( + name='L_Wrist', + id=12, + color=[0, 255, 0], + type='upper', + swap='R_Wrist'), + 13: + dict( + name='R_Wrist', + id=13, + color=[0, 255, 0], + type='upper', + swap='L_Wrist'), + 14: + dict( + name='L_Big_toe', + id=14, + color=[0, 255, 0], + type='lower', + swap='R_Big_toe'), + 15: + dict( + name='L_Small_toe', + id=15, + color=[0, 255, 0], + type='lower', + swap='R_Small_toe'), + 16: + dict( + name='L_Heel', + id=16, + color=[0, 255, 0], + type='lower', + swap='R_Heel'), + 17: + dict( + name='R_Big_toe', + id=17, + color=[0, 255, 0], + type='lower', + swap='L_Big_toe'), + 18: + dict( + name='R_Small_toe', + id=18, + color=[0, 255, 0], + type='lower', + swap='L_Small_toe'), + 19: + dict( + name='R_Heel', + id=19, + color=[0, 255, 0], + type='lower', + swap='L_Heel'), + 20: + dict( + name='L_Ear', id=20, color=[0, 255, 0], type='upper', + swap='R_Ear'), + 21: + dict( + name='R_Ear', id=21, color=[0, 255, 0], type='upper', + swap='L_Ear'), + 22: + dict(name='L_Eye', id=22, color=[0, 255, 0], type='', swap='R_Eye'), + 23: + dict(name='R_Eye', id=23, color=[0, 255, 0], type='', swap='L_Eye'), + 24: + dict(name='Nose', id=24, color=[0, 
255, 0], type='upper', swap=''), + 25: + dict( + name='L_Thumb_1', + id=25, + color=[255, 128, 0], + type='', + swap='R_Thumb_1'), + 26: + dict( + name='L_Thumb_2', + id=26, + color=[255, 128, 0], + type='', + swap='R_Thumb_2'), + 27: + dict( + name='L_Thumb_3', + id=27, + color=[255, 128, 0], + type='', + swap='R_Thumb_3'), + 28: + dict( + name='L_Thumb_4', + id=28, + color=[255, 128, 0], + type='', + swap='R_Thumb_4'), + 29: + dict( + name='L_Index_1', + id=29, + color=[255, 128, 0], + type='', + swap='R_Index_1'), + 30: + dict( + name='L_Index_2', + id=30, + color=[255, 128, 0], + type='', + swap='R_Index_2'), + 31: + dict( + name='L_Index_3', + id=31, + color=[255, 128, 0], + type='', + swap='R_Index_3'), + 32: + dict( + name='L_Index_4', + id=32, + color=[255, 128, 0], + type='', + swap='R_Index_4'), + 33: + dict( + name='L_Middle_1', + id=33, + color=[255, 128, 0], + type='', + swap='R_Middle_1'), + 34: + dict( + name='L_Middle_2', + id=34, + color=[255, 128, 0], + type='', + swap='R_Middle_2'), + 35: + dict( + name='L_Middle_3', + id=35, + color=[255, 128, 0], + type='', + swap='R_Middle_3'), + 36: + dict( + name='L_Middle_4', + id=36, + color=[255, 128, 0], + type='', + swap='R_Middle_4'), + 37: + dict( + name='L_Ring_1', + id=37, + color=[255, 128, 0], + type='', + swap='R_Ring_1'), + 38: + dict( + name='L_Ring_2', + id=38, + color=[255, 128, 0], + type='', + swap='R_Ring_2'), + 39: + dict( + name='L_Ring_3', + id=39, + color=[255, 128, 0], + type='', + swap='R_Ring_3'), + 40: + dict( + name='L_Ring_4', + id=40, + color=[255, 128, 0], + type='', + swap='R_Ring_4'), + 41: + dict( + name='L_Pinky_1', + id=41, + color=[255, 128, 0], + type='', + swap='R_Pinky_1'), + 42: + dict( + name='L_Pinky_2', + id=42, + color=[255, 128, 0], + type='', + swap='R_Pinky_2'), + 43: + dict( + name='L_Pinky_3', + id=43, + color=[255, 128, 0], + type='', + swap='R_Pinky_3'), + 44: + dict( + name='L_Pinky_4', + id=44, + color=[255, 128, 0], + type='', + swap='R_Pinky_4'), + 45: 
+ dict( + name='R_Thumb_1', + id=45, + color=[255, 128, 0], + type='', + swap='L_Thumb_1'), + 46: + dict( + name='R_Thumb_2', + id=46, + color=[255, 128, 0], + type='', + swap='L_Thumb_2'), + 47: + dict( + name='R_Thumb_3', + id=47, + color=[255, 128, 0], + type='', + swap='L_Thumb_3'), + 48: + dict( + name='R_Thumb_4', + id=48, + color=[255, 128, 0], + type='', + swap='L_Thumb_4'), + 49: + dict( + name='R_Index_1', + id=49, + color=[255, 128, 0], + type='', + swap='L_Index_1'), + 50: + dict( + name='R_Index_2', + id=50, + color=[255, 128, 0], + type='', + swap='L_Index_2'), + 51: + dict( + name='R_Index_3', + id=51, + color=[255, 128, 0], + type='', + swap='L_Index_3'), + 52: + dict( + name='R_Index_4', + id=52, + color=[255, 128, 0], + type='', + swap='L_Index_4'), + 53: + dict( + name='R_Middle_1', + id=53, + color=[255, 128, 0], + type='', + swap='L_Middle_1'), + 54: + dict( + name='R_Middle_2', + id=54, + color=[255, 128, 0], + type='', + swap='L_Middle_2'), + 55: + dict( + name='R_Middle_3', + id=55, + color=[255, 128, 0], + type='', + swap='L_Middle_3'), + 56: + dict( + name='R_Middle_4', + id=56, + color=[255, 128, 0], + type='', + swap='L_Middle_4'), + 57: + dict( + name='R_Ring_1', + id=57, + color=[255, 128, 0], + type='', + swap='L_Ring_1'), + 58: + dict( + name='R_Ring_2', + id=58, + color=[255, 128, 0], + type='', + swap='L_Ring_2'), + 59: + dict( + name='R_Ring_3', + id=59, + color=[255, 128, 0], + type='', + swap='L_Ring_3'), + 60: + dict( + name='R_Ring_4', + id=60, + color=[255, 128, 0], + type='', + swap='L_Ring_4'), + 61: + dict( + name='R_Pinky_1', + id=61, + color=[255, 128, 0], + type='', + swap='L_Pinky_1'), + 62: + dict( + name='R_Pinky_2', + id=62, + color=[255, 128, 0], + type='', + swap='L_Pinky_2'), + 63: + dict( + name='R_Pinky_3', + id=63, + color=[255, 128, 0], + type='', + swap='L_Pinky_3'), + 64: + dict( + name='R_Pinky_4', + id=64, + color=[255, 128, 0], + type='', + swap='L_Pinky_4'), + 65: + dict(name='Face_1', id=65, 
color=[255, 255, 255], type='', swap=''), + 66: + dict(name='Face_2', id=66, color=[255, 255, 255], type='', swap=''), + 67: + dict( + name='Face_3', + id=67, + color=[255, 255, 255], + type='', + swap='Face_4'), + 68: + dict( + name='Face_4', + id=68, + color=[255, 255, 255], + type='', + swap='Face_3'), + 69: + dict( + name='Face_5', + id=69, + color=[255, 255, 255], + type='', + swap='Face_14'), + 70: + dict( + name='Face_6', + id=70, + color=[255, 255, 255], + type='', + swap='Face_13'), + 71: + dict( + name='Face_7', + id=71, + color=[255, 255, 255], + type='', + swap='Face_12'), + 72: + dict( + name='Face_8', + id=72, + color=[255, 255, 255], + type='', + swap='Face_11'), + 73: + dict( + name='Face_9', + id=73, + color=[255, 255, 255], + type='', + swap='Face_10'), + 74: + dict( + name='Face_10', + id=74, + color=[255, 255, 255], + type='', + swap='Face_9'), + 75: + dict( + name='Face_11', + id=75, + color=[255, 255, 255], + type='', + swap='Face_8'), + 76: + dict( + name='Face_12', + id=76, + color=[255, 255, 255], + type='', + swap='Face_7'), + 77: + dict( + name='Face_13', + id=77, + color=[255, 255, 255], + type='', + swap='Face_6'), + 78: + dict( + name='Face_14', + id=78, + color=[255, 255, 255], + type='', + swap='Face_5'), + 79: + dict(name='Face_15', id=79, color=[255, 255, 255], type='', swap=''), + 80: + dict(name='Face_16', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict(name='Face_17', id=81, color=[255, 255, 255], type='', swap=''), + 82: + dict(name='Face_18', id=82, color=[255, 255, 255], type='', swap=''), + 83: + dict( + name='Face_19', + id=83, + color=[255, 255, 255], + type='', + swap='Face_23'), + 84: + dict( + name='Face_20', + id=84, + color=[255, 255, 255], + type='', + swap='Face_22'), + 85: + dict(name='Face_21', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='Face_22', + id=86, + color=[255, 255, 255], + type='', + swap='Face_20'), + 87: + dict( + name='Face_23', + id=87, + color=[255, 255, 
255], + type='', + swap='Face_19'), + 88: + dict( + name='Face_24', + id=88, + color=[255, 255, 255], + type='', + swap='Face_33'), + 89: + dict( + name='Face_25', + id=89, + color=[255, 255, 255], + type='', + swap='Face_32'), + 90: + dict( + name='Face_26', + id=90, + color=[255, 255, 255], + type='', + swap='Face_31'), + 91: + dict( + name='Face_27', + id=91, + color=[255, 255, 255], + type='', + swap='Face_30'), + 92: + dict( + name='Face_28', + id=92, + color=[255, 255, 255], + type='', + swap='Face_35'), + 93: + dict( + name='Face_29', + id=93, + color=[255, 255, 255], + type='', + swap='Face_34'), + 94: + dict( + name='Face_30', + id=94, + color=[255, 255, 255], + type='', + swap='Face_27'), + 95: + dict( + name='Face_31', + id=95, + color=[255, 255, 255], + type='', + swap='Face_26'), + 96: + dict( + name='Face_32', + id=96, + color=[255, 255, 255], + type='', + swap='Face_25'), + 97: + dict( + name='Face_33', + id=97, + color=[255, 255, 255], + type='', + swap='Face_24'), + 98: + dict( + name='Face_34', + id=98, + color=[255, 255, 255], + type='', + swap='Face_29'), + 99: + dict( + name='Face_35', + id=99, + color=[255, 255, 255], + type='', + swap='Face_28'), + 100: + dict( + name='Face_36', + id=100, + color=[255, 255, 255], + type='', + swap='Face_42'), + 101: + dict( + name='Face_37', + id=101, + color=[255, 255, 255], + type='', + swap='Face_41'), + 102: + dict( + name='Face_38', + id=102, + color=[255, 255, 255], + type='', + swap='Face_40'), + 103: + dict(name='Face_39', id=103, color=[255, 255, 255], type='', swap=''), + 104: + dict( + name='Face_40', + id=104, + color=[255, 255, 255], + type='', + swap='Face_38'), + 105: + dict( + name='Face_41', + id=105, + color=[255, 255, 255], + type='', + swap='Face_37'), + 106: + dict( + name='Face_42', + id=106, + color=[255, 255, 255], + type='', + swap='Face_36'), + 107: + dict( + name='Face_43', + id=107, + color=[255, 255, 255], + type='', + swap='Face_47'), + 108: + dict( + name='Face_44', + id=108, + 
color=[255, 255, 255], + type='', + swap='Face_46'), + 109: + dict(name='Face_45', id=109, color=[255, 255, 255], type='', swap=''), + 110: + dict( + name='Face_46', + id=110, + color=[255, 255, 255], + type='', + swap='Face_44'), + 111: + dict( + name='Face_47', + id=111, + color=[255, 255, 255], + type='', + swap='Face_43'), + 112: + dict( + name='Face_48', + id=112, + color=[255, 255, 255], + type='', + swap='Face_52'), + 113: + dict( + name='Face_49', + id=113, + color=[255, 255, 255], + type='', + swap='Face_51'), + 114: + dict(name='Face_50', id=114, color=[255, 255, 255], type='', swap=''), + 115: + dict( + name='Face_51', + id=115, + color=[255, 255, 255], + type='', + swap='Face_49'), + 116: + dict( + name='Face_52', + id=116, + color=[255, 255, 255], + type='', + swap='Face_48'), + 117: + dict( + name='Face_53', + id=117, + color=[255, 255, 255], + type='', + swap='Face_55'), + 118: + dict(name='Face_54', id=118, color=[255, 255, 255], type='', swap=''), + 119: + dict( + name='Face_55', + id=119, + color=[255, 255, 255], + type='', + swap='Face_53'), + 120: + dict( + name='Face_56', + id=120, + color=[255, 255, 255], + type='', + swap='Face_72'), + 121: + dict( + name='Face_57', + id=121, + color=[255, 255, 255], + type='', + swap='Face_71'), + 122: + dict( + name='Face_58', + id=122, + color=[255, 255, 255], + type='', + swap='Face_70'), + 123: + dict( + name='Face_59', + id=123, + color=[255, 255, 255], + type='', + swap='Face_69'), + 124: + dict( + name='Face_60', + id=124, + color=[255, 255, 255], + type='', + swap='Face_68'), + 125: + dict( + name='Face_61', + id=125, + color=[255, 255, 255], + type='', + swap='Face_67'), + 126: + dict( + name='Face_62', + id=126, + color=[255, 255, 255], + type='', + swap='Face_66'), + 127: + dict( + name='Face_63', + id=127, + color=[255, 255, 255], + type='', + swap='Face_65'), + 128: + dict(name='Face_64', id=128, color=[255, 255, 255], type='', swap=''), + 129: + dict( + name='Face_65', + id=129, + color=[255, 
255, 255], + type='', + swap='Face_63'), + 130: + dict( + name='Face_66', + id=130, + color=[255, 255, 255], + type='', + swap='Face_62'), + 131: + dict( + name='Face_67', + id=131, + color=[255, 255, 255], + type='', + swap='Face_61'), + 132: + dict( + name='Face_68', + id=132, + color=[255, 255, 255], + type='', + swap='Face_60'), + 133: + dict( + name='Face_69', + id=133, + color=[255, 255, 255], + type='', + swap='Face_59'), + 134: + dict( + name='Face_70', + id=134, + color=[255, 255, 255], + type='', + swap='Face_58'), + 135: + dict( + name='Face_71', + id=135, + color=[255, 255, 255], + type='', + swap='Face_57'), + 136: + dict( + name='Face_72', + id=136, + color=[255, 255, 255], + type='', + swap='Face_56'), + }, + skeleton_info={ + 0: dict(link=('L_Ankle', 'L_Knee'), id=0, color=[0, 255, 0]), + 1: dict(link=('L_Knee', 'L_Hip'), id=1, color=[0, 255, 0]), + 2: dict(link=('R_Ankle', 'R_Knee'), id=2, color=[0, 255, 0]), + 3: dict(link=('R_Knee', 'R_Hip'), id=3, color=[0, 255, 0]), + 4: dict(link=('L_Hip', 'R_Hip'), id=4, color=[0, 255, 0]), + 5: dict(link=('L_Shoulder', 'L_Hip'), id=5, color=[0, 255, 0]), + 6: dict(link=('R_Shoulder', 'R_Hip'), id=6, color=[0, 255, 0]), + 7: dict(link=('L_Shoulder', 'R_Shoulder'), id=7, color=[0, 255, 0]), + 8: dict(link=('L_Shoulder', 'L_Elbow'), id=8, color=[0, 255, 0]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=9, color=[0, 255, 0]), + 10: dict(link=('L_Elbow', 'L_Wrist'), id=10, color=[0, 255, 0]), + 11: dict(link=('R_Elbow', 'R_Wrist'), id=11, color=[255, 128, 0]), + 12: dict(link=('L_Eye', 'R_Eye'), id=12, color=[255, 128, 0]), + 13: dict(link=('Nose', 'L_Eye'), id=13, color=[255, 128, 0]), + 14: dict(link=('Nose', 'R_Eye'), id=14, color=[255, 128, 0]), + 15: dict(link=('L_Eye', 'L_Ear'), id=15, color=[255, 128, 0]), + 16: dict(link=('R_Eye', 'R_Ear'), id=16, color=[255, 128, 0]), + 17: dict(link=('L_Ear', 'L_Shoulder'), id=17, color=[255, 128, 0]), + 18: dict(link=('R_Ear', 'R_Shoulder'), id=18, color=[255, 128, 
0]), + 19: dict(link=('L_Ankle', 'L_Big_toe'), id=19, color=[255, 128, 0]), + 20: dict(link=('L_Ankle', 'L_Small_toe'), id=20, color=[255, 128, 0]), + 21: dict(link=('L_Ankle', 'L_Heel'), id=21, color=[255, 128, 0]), + 22: dict(link=('R_Ankle', 'R_Big_toe'), id=22, color=[255, 128, 0]), + 23: dict(link=('R_Ankle', 'R_Small_toe'), id=23, color=[255, 128, 0]), + 24: dict(link=('R_Ankle', 'R_Heel'), id=24, color=[255, 128, 0]), + 25: dict(link=('L_Wrist', 'L_Thumb_1'), id=25, color=[255, 128, 0]), + 26: dict(link=('L_Thumb_1', 'L_Thumb_2'), id=26, color=[255, 128, 0]), + 27: dict(link=('L_Thumb_2', 'L_Thumb_3'), id=27, color=[255, 128, 0]), + 28: dict(link=('L_Thumb_3', 'L_Thumb_4'), id=28, color=[255, 128, 0]), + 29: dict(link=('L_Wrist', 'L_Index_1'), id=29, color=[255, 128, 0]), + 30: dict(link=('L_Index_1', 'L_Index_2'), id=30, color=[255, 128, 0]), + 31: + dict(link=('L_Index_2', 'L_Index_3'), id=31, color=[255, 255, 255]), + 32: + dict(link=('L_Index_3', 'L_Index_4'), id=32, color=[255, 255, 255]), + 33: dict(link=('L_Wrist', 'L_Middle_1'), id=33, color=[255, 255, 255]), + 34: + dict(link=('L_Middle_1', 'L_Middle_2'), id=34, color=[255, 255, 255]), + 35: + dict(link=('L_Middle_2', 'L_Middle_3'), id=35, color=[255, 255, 255]), + 36: + dict(link=('L_Middle_3', 'L_Middle_4'), id=36, color=[255, 255, 255]), + 37: dict(link=('L_Wrist', 'L_Ring_1'), id=37, color=[255, 255, 255]), + 38: dict(link=('L_Ring_1', 'L_Ring_2'), id=38, color=[255, 255, 255]), + 39: dict(link=('L_Ring_2', 'L_Ring_3'), id=39, color=[255, 255, 255]), + 40: dict(link=('L_Ring_3', 'L_Ring_4'), id=40, color=[255, 255, 255]), + 41: dict(link=('L_Wrist', 'L_Pinky_1'), id=41, color=[255, 255, 255]), + 42: + dict(link=('L_Pinky_1', 'L_Pinky_2'), id=42, color=[255, 255, 255]), + 43: + dict(link=('L_Pinky_2', 'L_Pinky_3'), id=43, color=[255, 255, 255]), + 44: + dict(link=('L_Pinky_3', 'L_Pinky_4'), id=44, color=[255, 255, 255]), + 45: dict(link=('R_Wrist', 'R_Thumb_1'), id=45, color=[255, 255, 255]), + 
46: + dict(link=('R_Thumb_1', 'R_Thumb_2'), id=46, color=[255, 255, 255]), + 47: + dict(link=('R_Thumb_2', 'R_Thumb_3'), id=47, color=[255, 255, 255]), + 48: + dict(link=('R_Thumb_3', 'R_Thumb_4'), id=48, color=[255, 255, 255]), + 49: dict(link=('R_Wrist', 'R_Index_1'), id=49, color=[255, 255, 255]), + 50: + dict(link=('R_Index_1', 'R_Index_2'), id=50, color=[255, 255, 255]), + 51: + dict(link=('R_Index_2', 'R_Index_3'), id=51, color=[255, 255, 255]), + 52: + dict(link=('R_Index_3', 'R_Index_4'), id=52, color=[255, 255, 255]), + 53: dict(link=('R_Wrist', 'R_Middle_1'), id=53, color=[255, 255, 255]), + 54: + dict(link=('R_Middle_1', 'R_Middle_2'), id=54, color=[255, 255, 255]), + 55: + dict(link=('R_Middle_2', 'R_Middle_3'), id=55, color=[255, 255, 255]), + 56: + dict(link=('R_Middle_3', 'R_Middle_4'), id=56, color=[255, 255, 255]), + 57: dict(link=('R_Wrist', 'R_Pinky_1'), id=57, color=[255, 255, 255]), + 58: + dict(link=('R_Pinky_1', 'R_Pinky_2'), id=58, color=[255, 255, 255]), + 59: + dict(link=('R_Pinky_2', 'R_Pinky_3'), id=59, color=[255, 255, 255]), + 60: + dict(link=('R_Pinky_3', 'R_Pinky_4'), id=60, color=[255, 255, 255]), + }, + joint_weights=[1.] 
* 137, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/wflw.py b/modules/rtmpose/configs/_base_/datasets/wflw.py new file mode 100644 index 0000000..c31750b --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/wflw.py @@ -0,0 +1,192 @@ +dataset_info = dict( + dataset_name='wflw', + paper_info=dict( + author='Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, ' + 'Quan and Cai, Yici and Zhou, Qiang', + title='Look at boundary: A boundary-aware face alignment algorithm', + container='Proceedings of the IEEE conference on computer ' + 'vision and pattern recognition', + year='2018', + homepage='https://wywu.github.io/projects/LAB/WFLW.html', + ), + keypoint_info={ + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-32'), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-31'), + 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-30'), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-29'), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-28'), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-27'), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-26'), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-25'), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-24'), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-23'), + 10: + dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-22'), + 11: + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-21'), + 12: + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-20'), + 13: + dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-19'), + 14: + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-18'), + 15: + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-17'), + 16: dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''), + 17: + 
dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-15'), + 18: + dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-14'), + 19: + dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-13'), + 20: + dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-12'), + 21: + dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap='kpt-11'), + 22: + dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-10'), + 23: + dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-9'), + 24: + dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-8'), + 25: + dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-7'), + 26: + dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-6'), + 27: + dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap='kpt-5'), + 28: + dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap='kpt-4'), + 29: + dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap='kpt-3'), + 30: + dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap='kpt-2'), + 31: + dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap='kpt-1'), + 32: + dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-0'), + 33: + dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap='kpt-46'), + 34: + dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-45'), + 35: + dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap='kpt-44'), + 36: + dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-43'), + 37: dict( + name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-42'), + 38: dict( + name='kpt-38', id=38, color=[255, 0, 0], type='', swap='kpt-50'), + 39: dict( + name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-49'), + 40: dict( + name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-48'), + 41: dict( + name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-47'), + 42: dict( + name='kpt-42', id=42, 
color=[255, 0, 0], type='', swap='kpt-37'), + 43: dict( + name='kpt-43', id=43, color=[255, 0, 0], type='', swap='kpt-36'), + 44: dict( + name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-35'), + 45: dict( + name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-34'), + 46: dict( + name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-33'), + 47: dict( + name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-41'), + 48: dict( + name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-40'), + 49: dict( + name='kpt-49', id=49, color=[255, 0, 0], type='', swap='kpt-39'), + 50: dict( + name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-38'), + 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''), + 52: dict(name='kpt-52', id=52, color=[255, 0, 0], type='', swap=''), + 53: dict(name='kpt-53', id=53, color=[255, 0, 0], type='', swap=''), + 54: dict(name='kpt-54', id=54, color=[255, 0, 0], type='', swap=''), + 55: dict( + name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-59'), + 56: dict( + name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-58'), + 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''), + 58: dict( + name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-56'), + 59: dict( + name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-55'), + 60: dict( + name='kpt-60', id=60, color=[255, 0, 0], type='', swap='kpt-72'), + 61: dict( + name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-71'), + 62: dict( + name='kpt-62', id=62, color=[255, 0, 0], type='', swap='kpt-70'), + 63: dict( + name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-69'), + 64: dict( + name='kpt-64', id=64, color=[255, 0, 0], type='', swap='kpt-68'), + 65: dict( + name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-75'), + 66: dict( + name='kpt-66', id=66, color=[255, 0, 0], type='', swap='kpt-74'), + 67: dict( + name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-73'), 
+ 68: dict( + name='kpt-68', id=68, color=[255, 0, 0], type='', swap='kpt-64'), + 69: dict( + name='kpt-69', id=69, color=[255, 0, 0], type='', swap='kpt-63'), + 70: dict( + name='kpt-70', id=70, color=[255, 0, 0], type='', swap='kpt-62'), + 71: dict( + name='kpt-71', id=71, color=[255, 0, 0], type='', swap='kpt-61'), + 72: dict( + name='kpt-72', id=72, color=[255, 0, 0], type='', swap='kpt-60'), + 73: dict( + name='kpt-73', id=73, color=[255, 0, 0], type='', swap='kpt-67'), + 74: dict( + name='kpt-74', id=74, color=[255, 0, 0], type='', swap='kpt-66'), + 75: dict( + name='kpt-75', id=75, color=[255, 0, 0], type='', swap='kpt-65'), + 76: dict( + name='kpt-76', id=76, color=[255, 0, 0], type='', swap='kpt-82'), + 77: dict( + name='kpt-77', id=77, color=[255, 0, 0], type='', swap='kpt-81'), + 78: dict( + name='kpt-78', id=78, color=[255, 0, 0], type='', swap='kpt-80'), + 79: dict(name='kpt-79', id=79, color=[255, 0, 0], type='', swap=''), + 80: dict( + name='kpt-80', id=80, color=[255, 0, 0], type='', swap='kpt-78'), + 81: dict( + name='kpt-81', id=81, color=[255, 0, 0], type='', swap='kpt-77'), + 82: dict( + name='kpt-82', id=82, color=[255, 0, 0], type='', swap='kpt-76'), + 83: dict( + name='kpt-83', id=83, color=[255, 0, 0], type='', swap='kpt-87'), + 84: dict( + name='kpt-84', id=84, color=[255, 0, 0], type='', swap='kpt-86'), + 85: dict(name='kpt-85', id=85, color=[255, 0, 0], type='', swap=''), + 86: dict( + name='kpt-86', id=86, color=[255, 0, 0], type='', swap='kpt-84'), + 87: dict( + name='kpt-87', id=87, color=[255, 0, 0], type='', swap='kpt-83'), + 88: dict( + name='kpt-88', id=88, color=[255, 0, 0], type='', swap='kpt-92'), + 89: dict( + name='kpt-89', id=89, color=[255, 0, 0], type='', swap='kpt-91'), + 90: dict(name='kpt-90', id=90, color=[255, 0, 0], type='', swap=''), + 91: dict( + name='kpt-91', id=91, color=[255, 0, 0], type='', swap='kpt-89'), + 92: dict( + name='kpt-92', id=92, color=[255, 0, 0], type='', swap='kpt-88'), + 93: dict( + 
name='kpt-93', id=93, color=[255, 0, 0], type='', swap='kpt-95'), + 94: dict(name='kpt-94', id=94, color=[255, 0, 0], type='', swap=''), + 95: dict( + name='kpt-95', id=95, color=[255, 0, 0], type='', swap='kpt-93'), + 96: dict( + name='kpt-96', id=96, color=[255, 0, 0], type='', swap='kpt-97'), + 97: dict( + name='kpt-97', id=97, color=[255, 0, 0], type='', swap='kpt-96') + }, + skeleton_info={}, + joint_weights=[1.] * 98, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/datasets/zebra.py b/modules/rtmpose/configs/_base_/datasets/zebra.py new file mode 100644 index 0000000..bc4f9ec --- /dev/null +++ b/modules/rtmpose/configs/_base_/datasets/zebra.py @@ -0,0 +1,64 @@ +dataset_info = dict( + dataset_name='zebra', + paper_info=dict( + author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and ' + 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and ' + 'Couzin, Iain D', + title='DeepPoseKit, a software toolkit for fast and robust ' + 'animal pose estimation using deep learning', + container='Elife', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='snout', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='head', id=1, color=[255, 255, 255], type='', swap=''), + 2: + dict(name='neck', id=2, color=[255, 255, 255], type='', swap=''), + 3: + dict( + name='forelegL1', + id=3, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 4: + dict( + name='forelegR1', + id=4, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 5: + dict( + name='hindlegL1', + id=5, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 6: + dict( + name='hindlegR1', + id=6, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 7: + dict(name='tailbase', id=7, color=[255, 255, 255], type='', swap=''), + 8: + dict(name='tailtip', id=8, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={ + 0: dict(link=('head', 'snout'), id=0, color=[255, 255, 255]), + 1: 
dict(link=('neck', 'head'), id=1, color=[255, 255, 255]), + 2: dict(link=('forelegL1', 'neck'), id=2, color=[255, 255, 255]), + 3: dict(link=('forelegR1', 'neck'), id=3, color=[255, 255, 255]), + 4: dict(link=('hindlegL1', 'tailbase'), id=4, color=[255, 255, 255]), + 5: dict(link=('hindlegR1', 'tailbase'), id=5, color=[255, 255, 255]), + 6: dict(link=('tailbase', 'neck'), id=6, color=[255, 255, 255]), + 7: dict(link=('tailtip', 'tailbase'), id=7, color=[255, 255, 255]) + }, + joint_weights=[1.] * 9, + sigmas=[]) diff --git a/modules/rtmpose/configs/_base_/default_runtime.py b/modules/rtmpose/configs/_base_/default_runtime.py new file mode 100644 index 0000000..c83ae93 --- /dev/null +++ b/modules/rtmpose/configs/_base_/default_runtime.py @@ -0,0 +1,54 @@ +default_scope = 'mmpose' + +# hooks +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=10), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='PoseVisualizationHook', enable=False), + badcase=dict( + type='BadCaseAnalysisHook', + enable=False, + out_dir='badcase', + metric_type='loss', + badcase_thr=5)) + +# custom hooks +custom_hooks = [ + # Synchronize model buffers such as running_mean and running_var in BN + # at the end of each epoch + dict(type='SyncBuffersHook') +] + +# multi-processing backend +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +# visualizer +vis_backends = [ + dict(type='LocalVisBackend'), + # dict(type='TensorboardVisBackend'), + # dict(type='WandbVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# logger +log_processor = dict( + type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) +log_level = 'INFO' +load_from = None +resume = False + +# file 
I/O backend +backend_args = dict(backend='local') + +# training/validation/testing progress +train_cfg = dict(by_epoch=True) +val_cfg = dict() +test_cfg = dict() diff --git a/modules/rtmpose/configs/animal_2d_keypoint/README.md b/modules/rtmpose/configs/animal_2d_keypoint/README.md new file mode 100644 index 0000000..1ee3b40 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/README.md @@ -0,0 +1,20 @@ +# 2D Animal Keypoint Detection + +2D animal keypoint detection (animal pose estimation) aims to detect the key-point of different species, including rats, +dogs, macaques, and cheetah. It provides detailed behavioral analysis for neuroscience, medical and ecology applications. + +## Data preparation + +Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_animal_keypoint.md) to prepare data. + +## Demo + +Please follow [DEMO](/demo/docs/en/2d_animal_demo.md) to generate fancy demos. + +
+ +
+ +
+ +
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000..b722d40 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/README.md @@ -0,0 +1,16 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. +In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. 
+ +## Results and Models + +### AP-10K Dataset + +Results on AP-10K validation set + +| Model | Input Size | AP | Details and Download | +| :-------: | :--------: | :---: | :------------------------------------------: | +| RTMPose-m | 256x256 | 0.722 | [rtmpose_ap10k.md](./ap10k/rtmpose_ap10k.md) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py new file mode 100644 index 0000000..576b71f --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py @@ -0,0 +1,245 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + 
deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'AP10KDataset' +data_mode = 'topdown' +data_root = 'data/ap10k/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/', +# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ 
+ dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-train-split1.json', + data_prefix=dict(img='data/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-val-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-test-split1.json', + 
data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-val-split1.json') +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.md b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.md new file mode 100644 index 0000000..e72cefb --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.md @@ -0,0 +1,25 @@ + + + + +
+AP-10K (NeurIPS'2021) + +```bibtex +@misc{yu2021ap10k, + title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild}, + author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao}, + year={2021}, + eprint={2108.12617}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ +Results on AP-10K validation set + +| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log | +| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: | +| [rtmpose-m](/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.722 | 0.939 | 0.788 | 0.569 | 0.728 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.json) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.yml b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.yml new file mode 100644 index 0000000..1f12576 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.yml @@ -0,0 +1,19 @@ +Models: +- Config: configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py + In Collection: RTMPose + Alias: animal + Metadata: + Architecture: + - RTMPose + Training Data: AP-10K + Name: rtmpose-m_8xb64-210e_ap10k-256x256 + Results: + - Dataset: AP-10K + Metrics: + AP: 0.722 + AP@0.5: 0.939 + AP@0.75: 0.788 + AP (L): 0.728 + AP (M): 0.569 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.pth diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/README.md new file mode 100644 index 0000000..bf13310 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/README.md @@ -0,0 +1,68 @@ +# Top-down heatmap-based pose estimation + +Top-down methods divide the task 
into two stages: object detection, followed by single-object pose estimation given object bounding boxes. Instead of estimating keypoint coordinates directly, the pose estimator will produce heatmaps which represent the +likelihood of being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html). + +
+ +
+ +## Results and Models + +### Animal-Pose Dataset + +Results on AnimalPose validation set (1117 instances) + +| Model | Input Size | AP | AR | Details and Download | +| :--------: | :--------: | :---: | :---: | :-------------------------------------------------------: | +| HRNet-w32 | 256x256 | 0.740 | 0.780 | [hrnet_animalpose.md](./animalpose/hrnet_animalpose.md) | +| HRNet-w48 | 256x256 | 0.738 | 0.778 | [hrnet_animalpose.md](./animalpose/hrnet_animalpose.md) | +| ResNet-152 | 256x256 | 0.704 | 0.748 | [resnet_animalpose.md](./animalpose/resnet_animalpose.md) | +| ResNet-101 | 256x256 | 0.696 | 0.736 | [resnet_animalpose.md](./animalpose/resnet_animalpose.md) | +| ResNet-50 | 256x256 | 0.691 | 0.736 | [resnet_animalpose.md](./animalpose/resnet_animalpose.md) | + +### AP-10K Dataset + +Results on AP-10K validation set + +| Model | Input Size | AP | Details and Download | +| :--------: | :--------: | :---: | :--------------------------------------------------: | +| HRNet-w48 | 256x256 | 0.728 | [hrnet_ap10k.md](./ap10k/hrnet_ap10k.md) | +| HRNet-w32 | 256x256 | 0.722 | [hrnet_ap10k.md](./ap10k/hrnet_ap10k.md) | +| ResNet-101 | 256x256 | 0.681 | [resnet_ap10k.md](./ap10k/resnet_ap10k.md) | +| ResNet-50 | 256x256 | 0.680 | [resnet_ap10k.md](./ap10k/resnet_ap10k.md) | +| CSPNeXt-m | 256x256 | 0.703 | [cspnext_udp_ap10k.md](./ap10k/cspnext_udp_ap10k.md) | + +### Desert Locust Dataset + +Results on Desert Locust test set + +| Model | Input Size | AUC | EPE | Details and Download | +| :--------: | :--------: | :---: | :--: | :-------------------------------------------: | +| ResNet-152 | 160x160 | 0.925 | 1.49 | [resnet_locust.md](./locust/resnet_locust.md) | +| ResNet-101 | 160x160 | 0.907 | 2.03 | [resnet_locust.md](./locust/resnet_locust.md) | +| ResNet-50 | 160x160 | 0.900 | 2.27 | [resnet_locust.md](./locust/resnet_locust.md) | + +### Grévy’s Zebra Dataset + +Results on Grévy’s Zebra test set + +| Model | Input Size | AUC | EPE | Details and Download | +| 
:--------: | :--------: | :---: | :--: | :----------------------------------------: | +| ResNet-152 | 160x160 | 0.921 | 1.67 | [resnet_zebra.md](./zebra/resnet_zebra.md) | +| ResNet-101 | 160x160 | 0.915 | 1.83 | [resnet_zebra.md](./zebra/resnet_zebra.md) | +| ResNet-50 | 160x160 | 0.914 | 1.87 | [resnet_zebra.md](./zebra/resnet_zebra.md) | + +### Animal-Kingdom Dataset + +Results on AnimalKingdom test set + +| Model | Input Size | class | PCK(0.05) | Details and Download | +| :-------: | :--------: | :-----------: | :-------: | :---------------------------------------------------: | +| HRNet-w32 | 256x256 | P1 | 0.6323 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) | +| HRNet-w32 | 256x256 | P2 | 0.3741 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) | +| HRNet-w32 | 256x256 | P3_mammals | 0.571 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) | +| HRNet-w32 | 256x256 | P3_amphibians | 0.5358 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) | +| HRNet-w32 | 256x256 | P3_reptiles | 0.51 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) | +| HRNet-w32 | 256x256 | P3_birds | 0.7671 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) | +| HRNet-w32 | 256x256 | P3_fishes | 0.6406 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.md new file mode 100644 index 0000000..a2ba7cb --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.md @@ -0,0 +1,47 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+AnimalKingdom (CVPR'2022) + +```bibtex +@InProceedings{ + Ng_2022_CVPR, + author = {Ng, Xun Long and Ong, Kian Eng and Zheng, Qichen and Ni, Yun and Yeo, Si Yong and Liu, Jun}, + title = {Animal Kingdom: A Large and Diverse Dataset for Animal Behavior Understanding}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2022}, + pages = {19023-19034} + } +``` + +
+ +Results on AnimalKingdom validation set + +| Arch | Input Size | PCK(0.05) | Official Repo | Paper | ckpt | log | +| ------------------------------------------------------ | ---------- | --------- | ------------- | ------ | ------------------------------------------------------ | ------------------------------------------------------ | +| [P1_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py) | 256x256 | 0.6323 | 0.6342 | 0.6606 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256-08bf96cb_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256-08bf96cb_20230519.json) | +| [P2_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py) | 256x256 | 0.3741 | 0.3726 | 0.393 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256-2396cc58_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256-2396cc58_20230519.json) | +| [P3_mammals_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py) | 256x256 | 0.571 | 0.5719 | 0.6159 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256-e8aadf02_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256-e8aadf02_20230519.json) | +| 
[P3_amphibians_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py) | 256x256 | 0.5358 | 0.5432 | 0.5674 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256-845085f9_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256-845085f9_20230519.json) | +| [P3_reptiles_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py) | 256x256 | 0.51 | 0.5 | 0.5606 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256-e8440c16_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256-e8440c16_20230519.json) | +| [P3_birds_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py) | 256x256 | 0.7671 | 0.7636 | 0.7735 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256-566feff5_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256-566feff5_20230519.json) | +| [P3_fishes_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py) | 256x256 | 0.6406 | 0.636 | 0.6825 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256-76c3999f_20230519.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256-76c3999f_20230519.json) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.yml new file mode 100644 index 0000000..b560cd8 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.yml @@ -0,0 +1,86 @@ +Models: +- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py + In Collection: HRNet + Metadata: + Architecture: &id001 + - HRNet + Training Data: AnimalKingdom_P1 + Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256 + Results: + - Dataset: AnimalKingdom + Metrics: + PCK: 0.6323 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256-08bf96cb_20230519.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: AnimalKingdom_P2 + Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256 + Results: + - Dataset: AnimalKingdom + Metrics: + PCK: 0.3741 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256-2396cc58_20230519.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: AnimalKingdom_P3_amphibian + Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256 + Results: + - Dataset: AnimalKingdom + Metrics: + PCK: 0.5358 + Task: Animal 2D Keypoint + 
Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256-845085f9_20230519.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: AnimalKingdom_P3_bird + Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256 + Results: + - Dataset: AnimalKingdom + Metrics: + PCK: 0.7671 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256-566feff5_20230519.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: AnimalKingdom_P3_fish + Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256 + Results: + - Dataset: AnimalKingdom + Metrics: + PCK: 0.6406 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256-76c3999f_20230519.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: AnimalKingdom_P3_mammal + Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256 + Results: + - Dataset: AnimalKingdom + Metrics: + PCK: 0.571 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256-e8aadf02_20230519.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py + In Collection: HRNet + Metadata: + 
Architecture: *id001 + Training Data: AnimalKingdom_P3_reptile + Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256 + Results: + - Dataset: AnimalKingdom + Metrics: + PCK: 0.51 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256-e8440c16_20230519.pth diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py new file mode 100644 index 0000000..9af6952 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py @@ -0,0 +1,146 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='AdamW', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + 
stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=23, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalKingdomDataset' +data_mode = 'topdown' +data_root = 'data/ak/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P1/train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=24, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
data_mode=data_mode, + ann_file='annotations/ak_P1/test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py new file mode 100644 index 0000000..d7f4238 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py @@ -0,0 +1,146 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='AdamW', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 
64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=23, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalKingdomDataset' +data_mode = 'topdown' +data_root = 'data/ak/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P2/train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=24, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P2/test.json', + data_prefix=dict(img='images/'), + 
test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py new file mode 100644 index 0000000..1b54bb7 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py @@ -0,0 +1,146 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='AdamW', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + 
num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=23, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalKingdomDataset' +data_mode = 'topdown' +data_root = 'data/ak/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_amphibian/train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=24, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_amphibian/test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) 
+test_dataloader = val_dataloader + +# evaluators +val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py new file mode 100644 index 0000000..a3e8d9e --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py @@ -0,0 +1,146 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='AdamW', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + 
stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=23, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalKingdomDataset' +data_mode = 'topdown' +data_root = 'data/ak/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_bird/train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=24, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_bird/test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = 
[dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py new file mode 100644 index 0000000..839e7f9 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py @@ -0,0 +1,146 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='AdamW', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + 
num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=23, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalKingdomDataset' +data_mode = 'topdown' +data_root = 'data/ak/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_fish/train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=24, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_fish/test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')] +test_evaluator = 
val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py new file mode 100644 index 0000000..a367693 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py @@ -0,0 +1,146 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='AdamW', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + 
init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=23, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalKingdomDataset' +data_mode = 'topdown' +data_root = 'data/ak/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_mammal/train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=24, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_mammal/test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')] +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py new file mode 100644 index 0000000..8d2c0d7 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py @@ -0,0 +1,146 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='AdamW', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + 
type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=23, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalKingdomDataset' +data_mode = 'topdown' +data_root = 'data/ak/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_reptile/train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=24, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ak_P3_reptile/test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')] +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.md new file mode 100644 index 0000000..bd17ee5 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.md @@ -0,0 +1,40 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+Animal-Pose (ICCV'2019) + +```bibtex +@InProceedings{Cao_2019_ICCV, + author = {Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing}, + title = {Cross-Domain Adaptation for Animal Pose Estimation}, + booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2019} +} +``` + +
+ +Results on AnimalPose validation set (1117 instances) + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py) | 256x256 | 0.740 | 0.959 | 0.833 | 0.780 | 0.965 | [ckpt](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_animalpose_256x256_20210426.log.json) | +| [pose_hrnet_w48](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py) | 256x256 | 0.738 | 0.958 | 0.831 | 0.778 | 0.962 | [ckpt](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_animalpose_256x256-34644726_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_animalpose_256x256_20210426.log.json) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.yml new file mode 100644 index 0000000..cb03cec --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.yml @@ -0,0 +1,34 @@ +Models: +- Config: configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py + In Collection: HRNet + Metadata: + Architecture: &id001 + - HRNet + Training Data: Animal-Pose + Name: td-hm_hrnet-w32_8xb64-210e_animalpose-256x256 + Results: + - Dataset: Animal-Pose + Metrics: + AP: 0.740 + AP@0.5: 0.959 + AP@0.75: 0.833 + AR: 0.780 + AR@0.5: 0.965 + Task: Animal 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: Animal-Pose + Name: td-hm_hrnet-w48_8xb64-210e_animalpose-256x256 + Results: + - Dataset: Animal-Pose + Metrics: + AP: 0.738 + AP@0.5: 0.958 + AP@0.75: 0.831 + AR: 0.778 + AR@0.5: 0.962 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_animalpose_256x256-34644726_20210426.pth diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.md new file mode 100644 index 0000000..85d3912 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.md @@ -0,0 +1,41 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+Animal-Pose (ICCV'2019) + +```bibtex +@InProceedings{Cao_2019_ICCV, + author = {Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing}, + title = {Cross-Domain Adaptation for Animal Pose Estimation}, + booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2019} +} +``` + +
+ +Results on AnimalPose validation set (1117 instances) + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnet_50](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py) | 256x256 | 0.691 | 0.947 | 0.770 | 0.736 | 0.955 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res50_animalpose_256x256-e1f30bff_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res50_animalpose_256x256_20210426.log.json) | +| [pose_resnet_101](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py) | 256x256 | 0.696 | 0.948 | 0.774 | 0.736 | 0.951 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res101_animalpose_256x256-85563f4a_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res101_animalpose_256x256_20210426.log.json) | +| [pose_resnet_152](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py) | 256x256 | 0.704 | 0.938 | 0.786 | 0.748 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res152_animalpose_256x256-a0a7506c_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res152_animalpose_256x256_20210426.log.json) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.yml new file mode 100644 index 0000000..2888981 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.yml @@ -0,0 +1,51 @@ +Models: +- Config: 
configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: Animal-Pose + Name: td-hm_res50_8xb64-210e_animalpose-256x256 + Results: + - Dataset: Animal-Pose + Metrics: + AP: 0.691 + AP@0.5: 0.947 + AP@0.75: 0.770 + AR: 0.736 + AR@0.5: 0.955 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res50_animalpose_256x256-e1f30bff_20210426.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: Animal-Pose + Name: td-hm_res101_8xb64-210e_animalpose-256x256 + Results: + - Dataset: Animal-Pose + Metrics: + AP: 0.696 + AP@0.5: 0.948 + AP@0.75: 0.774 + AR: 0.736 + AR@0.5: 0.951 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res101_animalpose_256x256-85563f4a_20210426.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: Animal-Pose + Name: td-hm_res152_8xb32-210e_animalpose-256x256 + Results: + - Dataset: Animal-Pose + Metrics: + AP: 0.704 + AP@0.5: 0.938 + AP@0.75: 0.786 + AR: 0.748 + AR@0.5: 0.946 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res152_animalpose_256x256-a0a7506c_20210426.pth diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py new file mode 100644 index 0000000..9dc501d --- /dev/null +++ 
b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py @@ -0,0 +1,147 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=20, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + 
flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalPoseDataset' +data_mode = 'topdown' +data_root = 'data/animalpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_val.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py new file mode 100644 index 0000000..d671f61 --- /dev/null +++ 
b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py @@ -0,0 +1,147 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=20, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + 
flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalPoseDataset' +data_mode = 'topdown' +data_root = 'data/animalpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_val.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py new file mode 100644 index 0000000..abc4127 --- /dev/null +++ 
b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py @@ -0,0 +1,118 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=20, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalPoseDataset' +data_mode = 'topdown' +data_root = 'data/animalpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_val.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py new file mode 100644 index 0000000..9e8b4bf --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py @@ -0,0 +1,118 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size 
+auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=20, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalPoseDataset' +data_mode = 'topdown' +data_root = 'data/animalpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( 
+ type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_val.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py new file mode 100644 index 0000000..953b9e9 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py @@ -0,0 +1,118 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=20, + 
loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AnimalPoseDataset' +data_mode = 'topdown' +data_root = 'data/animalpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/animalpose_val.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py new file mode 100644 index 0000000..66391d2 --- 
/dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py @@ -0,0 +1,220 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + 
flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'AP10KDataset' +data_mode = 'topdown' +data_root = 'data/ap10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-train-split1.json', + data_prefix=dict(img='data/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-val-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-test-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-val-split1.json') +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.md new file mode 100644 index 0000000..8d1c8d2 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.md @@ -0,0 +1,58 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+AP-10K (NeurIPS'2021) + +```bibtex +@misc{yu2021ap10k, + title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild}, + author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao}, + year={2021}, + eprint={2108.12617}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ +Results on AP-10K validation set + +| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log | +| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: | +| [pose_cspnext_m](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.703 | 0.944 | 0.776 | 0.513 | 0.710 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-ap10k_pt-in1k_210e-256x256-1f2d947a_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-ap10k_pt-in1k_210e-256x256-1f2d947a_20230123.json) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.yml new file mode 100644 index 0000000..da5785c --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.yml @@ -0,0 +1,19 @@ +Models: +- Config: configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py + In Collection: UDP + Metadata: + Architecture: &id001 + - UDP + - HRNet + Training Data: AP-10K + Name: cspnext-m_udp_8xb64-210e_ap10k-256x256 + Results: + - Dataset: AP-10K + Metrics: + AP: 0.703 + AP@0.5: 0.944 + AP@0.75: 0.776 + AP (L): 0.71 + AP (M): 0.513 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-ap10k_pt-in1k_210e-256x256-1f2d947a_20230123.pth diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.md new file mode 100644 index 0000000..639509d --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.md @@ -0,0 +1,41 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+AP-10K (NeurIPS'2021) + +```bibtex +@misc{yu2021ap10k, + title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild}, + author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao}, + year={2021}, + eprint={2108.12617}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ +Results on AP-10K validation set + +| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log | +| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: | +| [pose_hrnet_w32](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.722 | 0.935 | 0.789 | 0.557 | 0.729 | [ckpt](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_ap10k_256x256-18aac840_20211029.pth) | [log](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_ap10k_256x256-18aac840_20211029.log.json) | +| [pose_hrnet_w48](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.728 | 0.936 | 0.802 | 0.577 | 0.735 | [ckpt](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_ap10k_256x256-d95ab412_20211029.pth) | [log](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_ap10k_256x256-d95ab412_20211029.log.json) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.yml new file mode 100644 index 0000000..f485dcb --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.yml @@ -0,0 +1,34 @@ +Models: +- Config: configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py + In Collection: HRNet + Metadata: + Architecture: &id001 + - HRNet + Training Data: AP-10K + Name: td-hm_hrnet-w32_8xb64-210e_ap10k-256x256 + Results: + - Dataset: AP-10K + Metrics: + AP: 0.722 + AP@0.5: 0.935 + AP@0.75: 0.789 + AP (L): 0.729 + AP (M): 0.557 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_ap10k_256x256-18aac840_20211029.pth +- Config: 
configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: AP-10K + Name: td-hm_hrnet-w48_8xb64-210e_ap10k-256x256 + Results: + - Dataset: AP-10K + Metrics: + AP: 0.728 + AP@0.5: 0.936 + AP@0.75: 0.802 + AP (L): 0.735 + AP (M): 0.577 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_ap10k_256x256-d95ab412_20211029.pth diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.md new file mode 100644 index 0000000..7dcd2e3 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.md @@ -0,0 +1,41 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+AP-10K (NeurIPS'2021) + +```bibtex +@misc{yu2021ap10k, + title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild}, + author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao}, + year={2021}, + eprint={2108.12617}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ +Results on AP-10K validation set + +| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log | +| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: | +| [pose_resnet_50](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.680 | 0.926 | 0.738 | 0.552 | 0.687 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res50_ap10k_256x256-35760eb8_20211029.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res50_ap10k_256x256-35760eb8_20211029.log.json) | +| [pose_resnet_101](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.681 | 0.921 | 0.751 | 0.545 | 0.690 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res101_ap10k_256x256-9edfafb9_20211029.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res101_ap10k_256x256-9edfafb9_20211029.log.json) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml new file mode 100644 index 0000000..29d5b6e --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml @@ -0,0 +1,35 @@ +Models: +- Config: configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: AP-10K + Name: td-hm_res50_8xb64-210e_ap10k-256x256 + Results: + - Dataset: AP-10K + Metrics: + AP: 0.680 + AP@0.5: 0.926 + AP@0.75: 0.738 + AP (L): 0.687 + AP (M): 0.552 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res50_ap10k_256x256-35760eb8_20211029.pth +- Config: 
configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: AP-10K + Name: td-hm_res101_8xb64-210e_ap10k-256x256 + Results: + - Dataset: AP-10K + Metrics: + AP: 0.681 + AP@0.5: 0.921 + AP@0.75: 0.751 + AP (L): 0.690 + AP (M): 0.545 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res101_ap10k_256x256-9edfafb9_20211029.pth diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py new file mode 100644 index 0000000..9d661cc --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py @@ -0,0 +1,164 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + 
block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AP10KDataset' +data_mode = 'topdown' +data_root = 'data/ap10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-train-split1.json', + data_prefix=dict(img='data/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, 
round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-val-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-test-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-val-split1.json') +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py new file mode 100644 index 0000000..fe28073 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py @@ -0,0 +1,164 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), 
heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AP10KDataset' +data_mode = 'topdown' +data_root = 'data/ap10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + 
dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-train-split1.json', + data_prefix=dict(img='data/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-val-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-test-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-val-split1.json') +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py new file mode 100644 index 0000000..7c1739d --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py @@ -0,0 +1,135 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + 
milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AP10KDataset' +data_mode = 'topdown' +data_root = 'data/ap10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-train-split1.json', + data_prefix=dict(img='data/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + 
persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-val-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-test-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-val-split1.json') +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py new file mode 100644 index 0000000..703470f --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py @@ -0,0 +1,135 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# 
codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AP10KDataset' +data_mode = 'topdown' +data_root = 'data/ap10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-train-split1.json', + data_prefix=dict(img='data/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-val-split1.json', + 
data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-test-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-val-split1.json') +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.md new file mode 100644 index 0000000..ac07e5a --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.md @@ -0,0 +1,43 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+Desert Locust (Elife'2019) + +```bibtex +@article{graving2019deepposekit, + title={DeepPoseKit, a software toolkit for fast and robust animal pose estimation using deep learning}, + author={Graving, Jacob M and Chae, Daniel and Naik, Hemal and Li, Liang and Koger, Benjamin and Costelloe, Blair R and Couzin, Iain D}, + journal={Elife}, + volume={8}, + pages={e47994}, + year={2019}, + publisher={eLife Sciences Publications Limited} +} +``` + +
+ +Results on Desert Locust test set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_resnet_50](/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py) | 160x160 | 1.000 | 0.900 | 2.27 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res50_locust_160x160-9efca22b_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res50_locust_160x160_20210407.log.json) | +| [pose_resnet_101](/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py) | 160x160 | 1.000 | 0.907 | 2.03 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res101_locust_160x160-d77986b3_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res101_locust_160x160_20210407.log.json) | +| [pose_resnet_152](/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py) | 160x160 | 1.000 | 0.925 | 1.49 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res152_locust_160x160-4ea9b372_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res152_locust_160x160_20210407.log.json) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.yml new file mode 100644 index 0000000..e05b37d --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.yml @@ -0,0 +1,45 @@ +Models: +- Config: configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: Desert Locust + Name: 
td-hm_res50_8xb64-210e_locust-160x160 + Results: + - Dataset: Desert Locust + Metrics: + AUC: 0.9 + EPE: 2.27 + PCK@0.2: 1 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res50_locust_160x160-9efca22b_20210407.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: Desert Locust + Name: td-hm_res101_8xb64-210e_locust-160x160 + Results: + - Dataset: Desert Locust + Metrics: + AUC: 0.907 + EPE: 2.03 + PCK@0.2: 1 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res101_locust_160x160-d77986b3_20210407.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: Desert Locust + Name: td-hm_res152_8xb32-210e_locust-160x160 + Results: + - Dataset: Desert Locust + Metrics: + AUC: 0.925 + EPE: 1.49 + PCK@0.2: 1.0 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res152_locust_160x160-4ea9b372_20210407.pth diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py new file mode 100644 index 0000000..7881648 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + 
end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=35, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'LocustDataset' +data_mode = 'topdown' +data_root = 'data/locust/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_factor=0.25, + rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/locust_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, 
+ )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/locust_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py new file mode 100644 index 0000000..c7bdca7 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + 
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=35, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'LocustDataset' +data_mode = 'topdown' +data_root = 'data/locust/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_factor=0.25, + rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/locust_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/locust_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py new file mode 100644 index 0000000..309af14 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=35, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'LocustDataset' +data_mode = 'topdown' +data_root = 'data/locust/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='RandomFlip', direction='horizontal'), + 
dict( + type='RandomBBoxTransform', + shift_factor=0.25, + rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/locust_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/locust_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.md new file mode 100644 index 0000000..c49c11e --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.md @@ -0,0 +1,43 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+Grévy’s Zebra (Elife'2019) + +```bibtex +@article{graving2019deepposekit, + title={DeepPoseKit, a software toolkit for fast and robust animal pose estimation using deep learning}, + author={Graving, Jacob M and Chae, Daniel and Naik, Hemal and Li, Liang and Koger, Benjamin and Costelloe, Blair R and Couzin, Iain D}, + journal={Elife}, + volume={8}, + pages={e47994}, + year={2019}, + publisher={eLife Sciences Publications Limited} +} +``` + +
+ +Results on Grévy’s Zebra test set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_resnet_50](/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py) | 160x160 | 1.000 | 0.914 | 1.87 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res50_zebra_160x160-5a104833_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res50_zebra_160x160_20210407.log.json) | +| [pose_resnet_101](/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py) | 160x160 | 1.000 | 0.915 | 1.83 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res101_zebra_160x160-e8cb2010_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res101_zebra_160x160_20210407.log.json) | +| [pose_resnet_152](/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py) | 160x160 | 1.000 | 0.921 | 1.67 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res152_zebra_160x160-05de71dd_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res152_zebra_160x160_20210407.log.json) | diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.yml new file mode 100644 index 0000000..68ffbe7 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.yml @@ -0,0 +1,45 @@ +Models: +- Config: configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: "Gr\xE9vy\u2019s Zebra" + Name: 
td-hm_res50_8xb64-210e_zebra-160x160 + Results: + - Dataset: "Gr\xE9vy\u2019s Zebra" + Metrics: + AUC: 0.914 + EPE: 1.87 + PCK@0.2: 1.0 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res50_zebra_160x160-5a104833_20210407.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: "Gr\xE9vy\u2019s Zebra" + Name: td-hm_res101_8xb64-210e_zebra-160x160 + Results: + - Dataset: "Gr\xE9vy\u2019s Zebra" + Metrics: + AUC: 0.915 + EPE: 1.83 + PCK@0.2: 1.0 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res101_zebra_160x160-e8cb2010_20210407.pth +- Config: configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: "Gr\xE9vy\u2019s Zebra" + Name: td-hm_res152_8xb32-210e_zebra-160x160 + Results: + - Dataset: "Gr\xE9vy\u2019s Zebra" + Metrics: + AUC: 0.921 + EPE: 1.67 + PCK@0.2: 1.0 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res152_zebra_160x160-05de71dd_20210407.pth diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py new file mode 100644 index 0000000..9a22a33 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + 
dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=9, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'ZebraDataset' +data_mode = 'topdown' +data_root = 'data/zebra/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_factor=0.25, + rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/zebra_train.json', + 
data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/zebra_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py new file mode 100644 index 0000000..d1840b8 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + 
bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=9, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'ZebraDataset' +data_mode = 'topdown' +data_root = 'data/zebra/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_factor=0.25, + rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/zebra_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/zebra_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator 
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py new file mode 100644 index 0000000..f9dc0e3 --- /dev/null +++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=9, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'ZebraDataset' +data_mode = 'topdown' +data_root = 'data/zebra/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='RandomFlip', direction='horizontal'), 
+ dict( + type='RandomBBoxTransform', + shift_factor=0.25, + rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/zebra_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/zebra_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/README.md b/modules/rtmpose/configs/body_2d_keypoint/README.md new file mode 100644 index 0000000..15f244a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/README.md @@ -0,0 +1,21 @@ +# Human Body 2D Pose Estimation + +Multi-person human pose estimation is defined as the task of detecting the poses (or keypoints) of all people from an input image. + +Existing approaches can be categorized into top-down and bottom-up approaches. + +Top-down methods (e.g. DeepPose) divide the task into two stages: human detection and pose estimation. 
They perform human detection first, followed by single-person pose estimation given human bounding boxes. + +Bottom-up approaches (e.g. Associative Embedding) first detect all the keypoints and then group/associate them into person instances. + +## Data preparation + +Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_body_keypoint.md) to prepare data. + +## Demo + +Please follow [Demo](/demo/docs/en/2d_human_pose_demo.md#2d-human-pose-demo) to run demos. + +
+
+
diff --git a/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/README.md b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/README.md new file mode 100644 index 0000000..5592374 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/README.md @@ -0,0 +1,9 @@ +# Associative embedding: End-to-end learning for joint detection and grouping (AE) + +Associative Embedding is one of the most popular 2D bottom-up pose estimation approaches, which first detects all the keypoints and then groups/associates them into person instances. + +In order to group all the predicted keypoints into individuals, a tag is also predicted for each detected keypoint. Tags of the same person are similar, while tags of different people are different. Thus the keypoints can be grouped according to the tags. + +
+ +
diff --git a/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py new file mode 100644 index 0000000..c20df08 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py @@ -0,0 +1,166 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1.5e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=300, + milestones=[200, 260], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=192) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', interval=50)) + +# codec settings +codec = dict( + type='AssociativeEmbedding', + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=2, + decode_topk=30, + decode_center_shift=0.5, + decode_keypoint_order=[ + 0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16 + ], + decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + 
stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='AssociativeEmbeddingHead', + in_channels=32, + num_keypoints=17, + tag_dim=1, + tag_per_keypoint=True, + deconv_out_channels=None, + keypoint_loss=dict(type='KeypointMSELoss', use_target_weight=True), + tag_loss=dict(type='AssociativeEmbeddingLoss', loss_weight=0.001), + # The heatmap will be resized to the input size before decoding + # if ``restore_heatmap_size==True`` + decoder=dict(codec, heatmap_size=codec['input_size'])), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + shift_heatmap=False, + restore_heatmap_size=True, + align_corners=False)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [] +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=64, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=24, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none', + score_mode='bbox', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.md new file mode 100644 index 0000000..57b6fe0 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.md @@ -0,0 +1,57 @@ + + +
+Associative Embedding (NIPS'2017) + +```bibtex +@inproceedings{newell2017associative, + title={Associative embedding: End-to-end learning for joint detection and grouping}, + author={Newell, Alejandro and Huang, Zhiao and Deng, Jia}, + booktitle={Advances in neural information processing systems}, + pages={2277--2287}, + year={2017} +} +``` + +
+ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [HRNet-w32](/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py) | 512x512 | 0.656 | 0.864 | 0.719 | 0.711 | 0.893 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/hrnet_w32_coco_512x512-bcb8c247_20200816.pth) | [log](https://download.openmmlab.com/mmpose/bottom_up/hrnet_w32_coco_512x512_20200816.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.yml new file mode 100644 index 0000000..e1ca3f8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.yml @@ -0,0 +1,25 @@ +Collections: +- Name: AE + Paper: + Title: "Associative embedding: End-to-end learning for joint detection and grouping" + URL: https://arxiv.org/abs/1611.05424 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/associative_embedding.md +Models: +- Config: configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py + In Collection: AE + Metadata: + Architecture: + - AE + - HRNet + Training Data: COCO + Name: ae_hrnet-w32_8xb24-300e_coco-512x512 + Results: + - Dataset: COCO + Metrics: + AP: 0.656 + AP@0.5: 0.864 + AP@0.75: 0.719 + AR: 0.711 + AR@0.5: 0.893 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/bottom_up/hrnet_w32_coco_512x512-bcb8c247_20200816.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py 
b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py new file mode 100644 index 0000000..9c6c4ce --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py @@ -0,0 +1,164 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=140, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=140, + milestones=[90, 120], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=160) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128)) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='CIDHead', + in_channels=480, + num_keypoints=17, + gfd_channels=32, 
+ coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0), + decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0), + contrastive_loss=dict( + type='InfoNCELoss', temperature=0.05, loss_weight=1.0), + decoder=codec, + ), + train_cfg=dict(max_train_instances=200), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + shift_heatmap=False, + align_corners=False)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='BottomupGetHeatmapMask'), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=64, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=20, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = 
val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_thr=0.8, + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py new file mode 100644 index 0000000..f8042b6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py @@ -0,0 +1,164 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=140, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=140, + milestones=[90, 120], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=160) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128)) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384), 
+ multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='CIDHead', + in_channels=720, + num_keypoints=17, + gfd_channels=48, + coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0), + decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0), + contrastive_loss=dict( + type='InfoNCELoss', temperature=0.05, loss_weight=1.0), + decoder=codec, + ), + train_cfg=dict(max_train_instances=200), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + shift_heatmap=False, + align_corners=False)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='BottomupGetHeatmapMask'), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=64, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=20, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, 
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_thr=0.8, + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.md new file mode 100644 index 0000000..97d83e2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.md @@ -0,0 +1,42 @@ + + +
+CID (CVPR'2022) + +```bibtex +@InProceedings{Wang_2022_CVPR, + author = {Wang, Dongkai and Zhang, Shiliang}, + title = {Contextual Instance Decoupling for Robust Multi-Person Pose Estimation}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2022}, + pages = {11060-11068} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py) | 512x512 | 0.704 | 0.894 | 0.775 | 0.753 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_42b7e6e6-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_20230207.json) | +| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py) | 512x512 | 0.715 | 0.900 | 0.782 | 0.765 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_a36c3ecf-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_20230207.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.yml new file mode 100644 index 0000000..efd5ee6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.yml @@ -0,0 +1,41 @@ +Collections: +- Name: CID + Paper: + Title: Contextual Instance Decoupling for Robust Multi-Person Pose Estimation + URL: https://openaccess.thecvf.com/content/CVPR2022/html/Wang_Contextual_Instance_Decoupling_for_Robust_Multi-Person_Pose_Estimation_CVPR_2022_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/cid.md +Models: +- Config: configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py + In Collection: CID + Metadata: + Architecture: &id001 + - CID + - HRNet + Training Data: 
COCO + Name: cid_hrnet-w32_8xb20-140e_coco-512x512 + Results: + - Dataset: COCO + Metrics: + AP: 0.704 + AP@0.5: 0.894 + AP@0.75: 0.775 + AR: 0.753 + AR@0.5: 0.928 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_42b7e6e6-20230207.pth +- Config: configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py + In Collection: CID + Metadata: + Architecture: *id001 + Training Data: COCO + Name: cid_hrnet-w48_8xb20-140e_coco-512x512 + Results: + - Dataset: COCO + Metrics: + AP: 0.715 + AP@0.5: 0.9 + AP@0.75: 0.782 + AR: 0.765 + AR@0.5: 0.935 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_a36c3ecf-20230207.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/README.md b/modules/rtmpose/configs/body_2d_keypoint/dekr/README.md new file mode 100644 index 0000000..e30a9e9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/README.md @@ -0,0 +1,22 @@ +# Bottom-up Human Pose Estimation via Disentangled Keypoint Regression (DEKR) + + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ +DEKR is a popular 2D bottom-up pose estimation approach that simultaneously detects all the instances and regresses the offsets from the instance centers to joints. + +In order to predict the offsets more accurately, the offsets of different joints are regressed using separated branches with deformable convolutional layers. Thus convolution kernels with different shapes are adopted to extract features for the corresponding joint. diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py new file mode 100644 index 0000000..9d94ac5 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py @@ -0,0 +1,189 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=140, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=140, + milestones=[90, 120], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=80) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='SPR', + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, 2), + minimal_diagonal_length=32**0.5, + generate_keypoint_heatmaps=True, + decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + 
block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='DEKRHead', + in_channels=480, + num_keypoints=17, + heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True), + displacement_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + beta=1 / 9, + loss_weight=0.002, + ), + decoder=codec, + # This rescore net is adapted from the official repo. 
+ # If you are not using the original COCO dataset for training, + # please make sure to remove the `rescore_cfg` item + rescore_cfg=dict( + in_channels=74, + norm_indexes=(5, 6), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_coco-33d58c5c.pth')), + ), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + nms_dist_thr=0.05, + shift_heatmap=True, + align_corners=False)) + +# enable DDP training when rescore net is used +find_unused_parameters = True + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='BottomupGetHeatmapMask'), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=32, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=10, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none', + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py new file mode 100644 index 0000000..c4255fd --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py @@ -0,0 +1,190 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=140, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=140, + milestones=[90, 120], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=80) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='SPR', + input_size=(640, 640), + heatmap_size=(160, 160), + sigma=(4, 2), + minimal_diagonal_length=32**0.5, + generate_keypoint_heatmaps=True, + decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + 
stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='DEKRHead', + in_channels=720, + num_keypoints=17, + num_heatmap_filters=48, + heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True), + displacement_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + beta=1 / 9, + loss_weight=0.002, + ), + decoder=codec, + # This rescore net is adapted from the official repo. + # If you are not using the original COCO dataset for training, + # please make sure to remove the `rescore_cfg` item + rescore_cfg=dict( + in_channels=74, + norm_indexes=(5, 6), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_coco-33d58c5c.pth')), + ), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + nms_dist_thr=0.05, + shift_heatmap=True, + align_corners=False)) + +# enable DDP training when rescore net is used +find_unused_parameters = True + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='BottomupGetHeatmapMask'), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage'), + dict( + 
type='BottomupResize', + input_size=codec['input_size'], + size_factor=32, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=10, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none', + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.md new file mode 100644 index 0000000..bb2e279 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.md @@ -0,0 +1,58 @@ + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [HRNet-w32](/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py) | 512x512 | 0.686 | 0.868 | 0.750 | 0.735 | 0.898 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_ac7c17bf-20221228.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_20221228.json) | +| [HRNet-w48](/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py) | 640x640 | 0.714 | 0.883 | 0.777 | 0.762 | 0.915 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_74796c32-20230124.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_20230124.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.yml new file mode 100644 index 0000000..f34a91d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.yml @@ -0,0 +1,41 @@ +Collections: +- Name: DEKR + Paper: + Title: Bottom-up human pose estimation via disentangled keypoint regression + URL: https://arxiv.org/abs/2104.02300 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/dekr.md +Models: +- Config: configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py + In Collection: DEKR + Metadata: + Architecture: &id001 + - DEKR + - HRNet + Training Data: COCO + Name: dekr_hrnet-w32_8xb10-140e_coco-512x512 + Results: + - Dataset: COCO + 
Metrics: + AP: 0.686 + AP@0.5: 0.868 + AP@0.75: 0.750 + AR: 0.735 + AR@0.5: 0.898 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_ac7c17bf-20221228.pth +- Config: configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py + In Collection: DEKR + Metadata: + Architecture: *id001 + Training Data: COCO + Name: dekr_hrnet-w48_8xb10-140e_coco-640x640 + Results: + - Dataset: COCO + Metrics: + AP: 0.714 + AP@0.5: 0.883 + AP@0.75: 0.777 + AR: 0.762 + AR@0.5: 0.915 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_74796c32-20230124.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py new file mode 100644 index 0000000..adfc36e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py @@ -0,0 +1,190 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=20) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=300, + milestones=[200, 260], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=80) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater')) + +# codec settings +codec = dict( + type='SPR', + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, 2), + minimal_diagonal_length=32**0.5, + generate_keypoint_heatmaps=True, + 
decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='DEKRHead', + in_channels=480, + num_keypoints=14, + heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True), + displacement_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + beta=1 / 9, + loss_weight=0.004, + ), + decoder=codec, + # This rescore net is adapted from the official repo. 
+ # If you are not using the original CrowdPose dataset for training, + # please make sure to remove the `rescore_cfg` item + rescore_cfg=dict( + in_channels=59, + norm_indexes=(0, 1), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth')), + ), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + nms_dist_thr=0.05, + shift_heatmap=True, + align_corners=False)) + +# enable DDP training when rescore net is used +find_unused_parameters = True + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'bottomup' +data_root = 'data/crowdpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=32, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=10, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_test.json', 
+ data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/mmpose_crowdpose_test.json', + nms_mode='none', + score_mode='keypoint', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py new file mode 100644 index 0000000..89e1a4d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py @@ -0,0 +1,191 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=20) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=300, + milestones=[200, 260], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=40) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater')) + +# codec settings +codec = dict( + type='SPR', + input_size=(640, 640), + heatmap_size=(160, 160), + sigma=(4, 2), + minimal_diagonal_length=32**0.5, + generate_keypoint_heatmaps=True, + decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + 
num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='DEKRHead', + in_channels=720, + num_keypoints=14, + num_heatmap_filters=48, + heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True), + displacement_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + beta=1 / 9, + loss_weight=0.004, + ), + decoder=codec, + # This rescore net is adapted from the official repo. 
+ # If you are not using the original CrowdPose dataset for training, + # please make sure to remove the `rescore_cfg` item + rescore_cfg=dict( + in_channels=59, + norm_indexes=(0, 1), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth')), + ), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + nms_dist_thr=0.05, + shift_heatmap=True, + align_corners=False)) + +# enable DDP training when rescore net is used +find_unused_parameters = True + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'bottomup' +data_root = 'data/crowdpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=32, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=5, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_test.json', 
+ data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/mmpose_crowdpose_test.json', + nms_mode='none', + score_mode='keypoint', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.md new file mode 100644 index 0000000..9b1d251 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.md @@ -0,0 +1,56 @@ + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+
+Results on CrowdPose test without multi-scale test

| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log |
| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: |
| [HRNet-w32](/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py) | 512x512 | 0.663 | 0.857 | 0.714 | 0.740 | 0.671 | 0.576 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512_147bae97-20221228.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512_20221228.json) |
| [HRNet-w48](/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py) | 640x640 | 0.679 | 0.869 | 0.731 | 0.753 | 0.688 | 0.593 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_4ea6031e-20230128.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_20230128.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.yml new file mode 100644 index 0000000..c65d5a9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.yml @@ -0,0 +1,37 @@ +Models: +- Config: configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py + In Collection: DEKR + Metadata: + Architecture: &id001 + - DEKR + - HRNet + Training Data: CrowdPose + Name: dekr_hrnet-w32_8xb10-300e_crowdpose-512x512 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.663 + AP@0.5: 0.857 + AP@0.75: 0.714 + AP (E): 0.74 + AP (M): 0.671 + AP (H): 0.576 + 
Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512_147bae97-20221228.pth +- Config: configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py + In Collection: DEKR + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: dekr_hrnet-w48_8xb5-300e_crowdpose-640x640 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.679 + AP@0.5: 0.869 + AP@0.75: 0.731 + AP (E): 0.753 + AP (M): 0.688 + AP (H): 0.593 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_4ea6031e-20230128.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.md b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.md new file mode 100644 index 0000000..ffb4271 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.md @@ -0,0 +1,62 @@ + + 
+ED-Pose (ICLR'2023) + +```bibtex +@inproceedings{ +yang2023explicit, +title={Explicit Box Detection Unifies End-to-End Multi-Person Pose Estimation}, +author={Jie Yang and Ailing Zeng and Shilong Liu and Feng Li and Ruimao Zhang and Lei Zhang}, +booktitle={International Conference on Learning Representations}, +year={2023}, +url={https://openreview.net/forum?id=s4WVupnJjmX} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017. + +| Arch | BackBone | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :-------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :--------------------------------------------: | :-------------------------------------------: | +| [edpose_res50_coco](/configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py) | ResNet-50 | 0.716 | 0.897 | 0.783 | 0.793 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/edpose/coco/edpose_res50_coco_3rdparty.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/edpose/coco/edpose_res50_coco_3rdparty.json) | + +The checkpoint is converted from the official repo. The training of EDPose is not supported yet. It will be supported in the future updates. + +The above config follows [Pure Python style](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta). Please install `mmengine>=0.8.2` to use this config. 
diff --git a/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.yml new file mode 100644 index 0000000..bf461e3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.yml @@ -0,0 +1,26 @@ +Collections: +- Name: ED-Pose + Paper: + Title: Explicit Box Detection Unifies End-to-End Multi-Person Pose Estimation + URL: https://arxiv.org/pdf/2302.01593.pdf + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/edpose.md +Models: +- Config: configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py + In Collection: ED-Pose + Alias: edpose + Metadata: + Architecture: &id001 + - ED-Pose + - ResNet + Training Data: COCO + Name: edpose_res50_8xb2-50e_coco-800x1333 + Results: + - Dataset: COCO + Metrics: + AP: 0.716 + AP@0.5: 0.897 + AP@0.75: 0.783 + AR: 0.793 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/edpose/coco/edpose_res50_coco_3rdparty.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py new file mode 100644 index 0000000..0940b83 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py @@ -0,0 +1,236 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from mmpose.configs._base_.default_runtime import * # noqa + +from mmcv.transforms import RandomChoice, RandomChoiceResize +from mmengine.dataset import DefaultSampler +from mmengine.model import PretrainedInit +from mmengine.optim import LinearLR, MultiStepLR +from torch.nn import GroupNorm +from torch.optim import Adam + +from mmpose.codecs import EDPoseLabel +from mmpose.datasets import (BottomupRandomChoiceResize, BottomupRandomCrop, + CocoDataset, LoadImage, PackPoseInputs, + RandomFlip) +from mmpose.evaluation import CocoMetric +from mmpose.models import (BottomupPoseEstimator, ChannelMapper, EDPoseHead, + PoseDataPreprocessor, ResNet) +from mmpose.models.utils import FrozenBatchNorm2d + +# runtime +train_cfg.update(max_epochs=50, val_interval=10) # noqa + +# optimizer +optim_wrapper = dict(optimizer=dict( + type=Adam, + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict(type=LinearLR, begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type=MultiStepLR, + begin=0, + end=140, + milestones=[33, 45], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=80) + +# hooks +default_hooks.update( # noqa + checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict(type=EDPoseLabel, num_select=50, num_keypoints=17) + +# model settings +model = dict( + type=BottomupPoseEstimator, + data_preprocessor=dict( + type=PoseDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=FrozenBatchNorm2d, requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet50')), + neck=dict( + type=ChannelMapper, + 
in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type=GroupNorm, num_groups=32), + num_outs=4), + head=dict( + type=EDPoseHead, + num_queries=900, + num_feature_levels=4, + num_keypoints=17, + as_two_stage=True, + encoder=dict( + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.0))), + decoder=dict( + num_layers=6, + embed_dims=256, + layer_cfg=dict( # DeformableDetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + batch_first=True), + cross_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.1)), + query_dim=4, + num_feature_levels=4, + num_group=100, + num_dn=100, + num_box_decoder_layers=2, + return_intermediate=True), + out_head=dict(num_classes=2), + positional_encoding=dict( + num_pos_feats=128, + temperatureH=20, + temperatureW=20, + normalize=True), + denosing_cfg=dict( + dn_box_noise_scale=0.4, + dn_label_noise_ratio=0.5, + dn_labelbook_size=100, + dn_attn_mask_type_list=['match2dn', 'dn2dn', 'group2group']), + data_decoder=codec), + test_cfg=dict(multiscale_test=False, flip_test=False, num_select=50), + train_cfg=dict()) + +# enable DDP training when rescore net is used +find_unused_parameters = True + +# base dataset settings +dataset_type = CocoDataset +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type=LoadImage), + dict(type=RandomFlip, direction='horizontal'), + dict( + type=RandomChoice, + transforms=[ + [ + dict( + type=RandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 
1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type=BottomupRandomChoiceResize, + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type=BottomupRandomCrop, + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type=BottomupRandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type=PackPoseInputs), +] + +val_pipeline = [ + dict(type=LoadImage), + dict( + type=BottomupRandomChoiceResize, + scales=[(800, 1333)], + keep_ratio=True, + backend='pillow'), + dict( + type=PackPoseInputs, + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type=CocoMetric, + nms_mode='none', + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/README.md b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/README.md new file mode 100644 index 0000000..967b98e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/README.md @@ -0,0 +1,15 @@ +# Top-down integral-regression-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. At the 2nd stage, integral regression based methods use a simple integral operation relates and unifies the heatmap and joint regression differentiably, thus obtain the keypoint coordinates given the features extracted from the bounding box area, following the paradigm introduced in [Integral Human Pose Regression](https://arxiv.org/abs/1711.08229). + +## Results and Models + +### COCO Dataset + +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Model | Input Size | AP | AR | Details and Download | +| :------------------: | :--------: | :---: | :---: | :---------------------------------------------------: | +| ResNet-50+Debias-IPR | 256x256 | 0.675 | 0.765 | [resnet_debias_coco.md](./coco/resnet_debias_coco.md) | +| ResNet-50+DSNT | 256x256 | 0.674 | 0.764 | [resnet_dsnt_coco.md](./coco/resnet_dsnt_coco.md) | +| ResNet-50+IPR | 256x256 | 0.633 | 0.730 | [resnet_ipr_coco.md](./coco/resnet_ipr_coco.md) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py new file mode 100644 index 0000000..eb60eec --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py @@ -0,0 +1,134 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper 
= dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='IntegralRegressionLabel', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2.0, + normalize=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + ), + head=dict( + type='DSNTHead', + in_channels=2048, + in_featuremap_size=(8, 8), + num_joints=17, + loss=dict( + type='MultipleLossWrapper', + losses=[ + dict(type='SmoothL1Loss', use_target_weight=True), + dict(type='KeypointMSELoss', use_target_weight=True) + ]), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + shift_heatmap=True, + ), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth')) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +test_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + 
+# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py new file mode 100644 index 0000000..9a9cce5 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py @@ -0,0 +1,136 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 
200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='IntegralRegressionLabel', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2.0, + normalize=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + ), + head=dict( + type='DSNTHead', + in_channels=2048, + in_featuremap_size=(8, 8), + num_joints=17, + debias=True, + beta=10., + loss=dict( + type='MultipleLossWrapper', + losses=[ + dict(type='SmoothL1Loss', use_target_weight=True), + dict(type='JSDiscretLoss', use_target_weight=True) + ]), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + shift_heatmap=True, + ), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth')) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +test_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py new file mode 100644 index 0000000..7b262d9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py @@ -0,0 +1,134 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='IntegralRegressionLabel', + input_size=(256, 256), + 
heatmap_size=(64, 64), + sigma=2.0, + normalize=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + ), + head=dict( + type='DSNTHead', + in_channels=2048, + in_featuremap_size=(8, 8), + num_joints=17, + loss=dict( + type='MultipleLossWrapper', + losses=[ + dict(type='SmoothL1Loss', use_target_weight=True), + dict(type='JSDiscretLoss', use_target_weight=True) + ]), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + shift_heatmap=True, + ), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth')) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +test_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, 
round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.md b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.md new file mode 100644 index 0000000..406d2b6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.md @@ -0,0 +1,57 @@ + + +
+Debias IPR (ICCV'2021) + +```bibtex +@inproceedings{gu2021removing, + title={Removing the Bias of Integral Pose Regression}, + author={Gu, Kerui and Yang, Linlin and Yao, Angela}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={11067--11076}, + year={2021} + } +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [debias-ipr_resnet_50](/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py) | 256x256 | 0.675 | 0.872 | 0.740 | 0.765 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256-055a7699_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256-055a7699_20220913.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.yml new file mode 100644 index 0000000..155cdbf --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.yml @@ -0,0 +1,25 @@ +Collections: +- Name: DebiasIPR + Paper: + Title: Removing the Bias of Integral Pose Regression + URL: https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_Removing_the_Bias_of_Integral_Pose_Regression_ICCV_2021_paper.pdf + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/debias_ipr.md +Models: +- Config: configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias--8xb64-210e_coco-256x256.py + In Collection: DebiasIPR + Metadata: + Architecture: &id001 + - Debias + - ResNet + Training Data: COCO + Name: ipr_res50_debias--8xb64-210e_coco-256x256 + Results: + - Dataset: COCO + Metrics: + AP: 0.675 + AP@0.5: 0.872 + AP@0.75: 0.74 + AR: 0.765 + AR@0.5: 0.928 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256-055a7699_20220913.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.md b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.md new file mode 100644 index 0000000..d59266b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.md @@ -0,0 +1,56 @@ + + +
+DSNT (2018) + +```bibtex +@article{nibali2018numerical, + title={Numerical Coordinate Regression with Convolutional Neural Networks}, + author={Nibali, Aiden and He, Zhen and Morgan, Stuart and Prendergast, Luke}, + journal={arXiv preprint arXiv:1801.07372}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [ipr_resnet_50_dsnt](/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py) | 256x256 | 0.674 | 0.870 | 0.744 | 0.764 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256-441eedc0_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256-441eedc0_20220913.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.yml new file mode 100644 index 0000000..fa772e8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.yml @@ -0,0 +1,25 @@ +Collections: +- Name: DSNT + Paper: + Title: Numerical Coordinate Regression with Convolutional Neural Networks + URL: https://arxiv.org/abs/1801.07372v2 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/dsnt.md +Models: +- Config: configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py + In Collection: DSNT + Metadata: + Architecture: &id001 + - DSNT + - ResNet + Training Data: COCO + Name: ipr_res50_dsnt-8xb64-210e_coco-256x256 + Results: + - Dataset: COCO + Metrics: + AP: 0.674 + AP@0.5: 0.87 + AP@0.75: 0.744 + AR: 0.764 + AR@0.5: 0.928 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256-441eedc0_20220913.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.md b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.md new file mode 100644 index 0000000..d51e5df --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.md @@ -0,0 +1,57 @@ + + +
+IPR (ECCV'2018) + +```bibtex +@inproceedings{sun2018integral, + title={Integral human pose regression}, + author={Sun, Xiao and Xiao, Bin and Wei, Fangyin and Liang, Shuang and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={529--545}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [ipr_resnet_50](/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py) | 256x256 | 0.633 | 0.860 | 0.703 | 0.730 | 0.919 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256-a3898a33_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256-a3898a33_20220913.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.yml new file mode 100644 index 0000000..d40d190 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.yml @@ -0,0 +1,25 @@ +Collections: +- Name: IPR + Paper: + Title: Integral human pose regression + URL: https://arxiv.org/abs/1711.08229 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/ipr.md +Models: +- Config: configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py + In Collection: IPR + Metadata: + Architecture: &id001 + - IPR + - ResNet + Training Data: COCO + Name: ipr_res50_8xb64-210e_coco-256x256 + Results: + - Dataset: COCO + Metrics: + AP: 0.633 + AP@0.5: 0.86 + AP@0.75: 0.703 + AR: 0.73 + AR@0.5: 0.919 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256-a3898a33_20220913.pth diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/rtmo/README.md b/modules/rtmpose/configs/body_2d_keypoint/rtmo/README.md new file mode 100644 index 0000000..cd5b26f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/README.md @@ -0,0 +1,27 @@ +# RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation + + + +
+RTMO + +```bibtex +@misc{lu2023rtmo, + title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation}, + author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang}, + year={2023}, + eprint={2312.07526}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ +RTMO is a one-stage pose estimation model that seamlessly integrates coordinate classification into the YOLO architecture. It introduces a Dynamic Coordinate Classifier (DCC) module that handles keypoint localization through dual 1D heatmaps. The DCC employs dynamic bin allocation, localizing the coordinate bins to each predicted bounding box to improve efficiency. It also uses learnable bin representations based on positional encodings, enabling computation of bin-keypoint similarity for precise localization. + +RTMO is trained end-to-end using a multi-task loss, with losses for bounding box regression, keypoint heatmap classification via a novel MLE loss, keypoint coordinate proxy regression, and keypoint visibility classification. The MLE loss models annotation uncertainty and balances optimization between easy and hard samples. + +During inference, RTMO employs grid-based dense predictions to simultaneously output human detection boxes and poses in a single pass. It selectively decodes heatmaps only for high-scoring grids after NMS, minimizing computational cost. + +Compared to prior one-stage methods that regress keypoint coordinates directly, RTMO achieves higher accuracy through coordinate classification while retaining real-time speeds. It also outperforms lightweight top-down approaches for images with many people, as the latter have inference times that scale linearly with the number of human instances. 
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py new file mode 100644 index 0000000..e62ef17 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py @@ -0,0 +1,533 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, + end=280, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=281, + T_max=300, + end=580, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/coco.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', 
backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +# data settings +data_mode = 'bottomup' +data_root = 'data/' + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), + (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + 
(14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +posetrack_coco = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[(i, i) for i in range(17)]) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( 
+ type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +train_dataset = dict( + type='CombinedDataset', + metainfo=dict(from_file=metafile), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3], + test_mode=False, + pipeline=train_pipeline_stage1) + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset) + +# val datasets +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_dataset=dataset_coco, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 280: { + 'proxy_target_cc': True, + 'overlaps_power': 1.0, + 'loss_cls.loss_weight': 2.0, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + +# model +widen_factor = 1.0 +deepen_factor = 1.0 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco' + '_20211126_140236-d3bd2b23.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[256, 512, 1024], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, 
+ ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=512, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=17, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=512, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile)), + prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=512, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1e-2, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py new file mode 100644 index 0000000..ffddaab --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py @@ -0,0 +1,532 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, + end=280, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=281, + T_max=300, + end=580, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/coco.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + 
clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +# data settings +data_mode = 'bottomup' +data_root = 'data/' + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), + (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + 
+posetrack_coco = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[(i, i) for i in range(17)]) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', 
num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +train_dataset = dict( + type='CombinedDataset', + metainfo=dict(from_file=metafile), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3], + test_mode=False, + pipeline=train_pipeline_stage1) + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset) + +# val datasets +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_dataset=dataset_coco, + 
new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 280: { + 'proxy_target_cc': True, + 'overlaps_power': 1.0, + 'loss_cls.loss_weight': 2.0, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + +# model +widen_factor = 0.75 +deepen_factor = 0.67 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/' + 'pretrained_models/yolox_m_8x8_300e_coco_20230829.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[192, 384, 768], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=384, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=17, + featmap_strides=(16, 
32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=384, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile)), + prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=384, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1e-2, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py new file mode 100644 index 0000000..7d132fd --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py @@ -0,0 +1,535 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + 
checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, + end=280, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=281, + T_max=300, + end=580, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/coco.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +# data settings +data_mode = 'bottomup' +data_root = 'data/' + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), + (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +posetrack_coco = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + 
data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[(i, i) for i in range(17)]) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + 
dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +train_dataset = dict( + type='CombinedDataset', + metainfo=dict(from_file=metafile), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3], + test_mode=False, + pipeline=train_pipeline_stage1) + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset) + +# val datasets +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_dataset=dataset_coco, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 280: { + 'proxy_target_cc': True, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + 
type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + +# model +widen_factor = 0.5 +deepen_factor = 0.33 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_' + '20211121_095711-4592a793.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[128, 256, 512], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=17, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', 
+ dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile), + use_keypoints_for_center=True), + prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=256, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1.0, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py new file mode 100644 index 0000000..ec719a7 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py @@ -0,0 +1,529 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + 
bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, + end=280, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=281, + T_max=300, + end=580, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600), +] + +# data +input_size = (416, 416) +metafile = 'configs/_base_/datasets/coco.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(416, 416), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(416, 416), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(416, 416), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + 
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +# data settings +data_mode = 'bottomup' +data_root = 'data/' + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), + (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +posetrack_coco = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[(i, i) for i in range(17)]) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + 
'_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +train_dataset = dict( + type='CombinedDataset', + metainfo=dict(from_file=metafile), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3], + test_mode=False, + pipeline=train_pipeline_stage1) + +train_dataloader = dict( 
+ batch_size=32, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset) + +# val datasets +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_dataset=dataset_coco, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 280: { + 'proxy_target_cc': True, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + +# model +widen_factor = 0.375 +deepen_factor = 0.33 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + 
pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_' + '20211124_171234-b4047906.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[96, 192, 384], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=192, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=17, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=192, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile), + use_keypoints_for_center=True), + prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=192, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + 
pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1.0, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.md b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.md new file mode 100644 index 0000000..59bbfee --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.md @@ -0,0 +1,132 @@ + + +
+RTMO + +```bibtex +@misc{lu2023rtmo, + title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation}, + author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang}, + year={2023}, + eprint={2312.07526}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+AI Challenger (ArXiv'2017) + +```bibtex +@article{wu2017ai, + title={Ai challenger: A large-scale dataset for going deeper in image understanding}, + author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others}, + journal={arXiv preprint arXiv:1711.06475}, + year={2017} +} +``` + +
+ +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +
+JHMDB (ICCV'2013) + +```bibtex +@inproceedings{Jhuang:ICCV:2013, + title = {Towards understanding action recognition}, + author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black}, + booktitle = {International Conf. on Computer Vision (ICCV)}, + month = Dec, + pages = {3192-3199}, + year = {2013} +} +``` + +
+ +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +
+PoseTrack18 (CVPR'2018) + +```bibtex +@inproceedings{andriluka2018posetrack, + title={Posetrack: A benchmark for human pose estimation and tracking}, + author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={5167--5176}, + year={2018} +} +``` + +
+ +
+Halpe (CVPR'2020) + +```bibtex +@inproceedings{li2020pastanet, + title={PaStaNet: Toward Human Activity Knowledge Engine}, + author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu}, + booktitle={CVPR}, + year={2020} +} +``` + +
+ +Results on COCO val2017 + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | onnx | +| :--------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :--------------------------------: | :-------------------------------: | :--------------------------------: | +| [RTMO-t](/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py) | 640x640 | 0.574 | 0.803 | 0.613 | 0.611 | 0.836 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-t_8xb32-600e_body7-416x416-f48f75cb_20231219.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-t_8xb32-600e_body7-416x416_20231219.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-t_8xb32-600e_body7-416x416-f48f75cb_20231219.zip) | +| [RTMO-s](/configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py) | 640x640 | 0.686 | 0.879 | 0.744 | 0.723 | 0.908 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_body7-640x640_20231211.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.zip) | +| [RTMO-m](/configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py) | 640x640 | 0.726 | 0.899 | 0.790 | 0.763 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_body7-640x640-39e78cc4_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_body7-640x640_20231211.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-m_16xb16-600e_body7-640x640-39e78cc4_20231211.zip) | +| [RTMO-l](/configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py) | 640x640 | 0.748 | 0.911 | 0.813 | 0.786 | 0.939 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_body7-640x640-b37118ce_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_body7-640x640_20231211.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-l_16xb16-600e_body7-640x640-b37118ce_20231211.zip) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.yml new file mode 100644 index 0000000..046db3b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.yml @@ -0,0 +1,74 @@ +Models: +- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py + In Collection: RTMO + Metadata: + Architecture: &id001 + - RTMO + Training Data: &id002 + - AI Challenger + - COCO + - CrowdPose + - MPII + - sub-JHMDB + - Halpe + - PoseTrack18 + Name: rtmo-t_8xb32-600e_body7-416x416 + Results: + - Dataset: COCO + Metrics: + AP: 0.574 + AP@0.5: 0.803 + AP@0.75: 0.613 + AR: 0.611 + AR@0.5: 0.836 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-t_8xb32-600e_body7-416x416-f48f75cb_20231219.pth +- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py + In Collection: RTMO + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmo-s_8xb32-600e_body7-640x640 + Results: + - Dataset: COCO + Metrics: + AP: 0.686 + AP@0.5: 0.879 + AP@0.75: 0.744 + AR: 0.723 + AR@0.5: 0.908 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.pth +- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py + In Collection: RTMO + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmo-m_16xb16-600e_body7-640x640 + Results: + - Dataset: COCO + Metrics: + AP: 0.726 + AP@0.5: 0.899 + AP@0.75: 0.790 + AR: 0.763 + AR@0.5: 
0.926 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_body7-640x640-39e78cc4_20231211.pth +- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py + In Collection: RTMO + Alias: rtmo + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmo-l_16xb16-600e_body7-640x640 + Results: + - Dataset: COCO + Metrics: + AP: 0.748 + AP@0.5: 0.911 + AP@0.75: 0.813 + AR: 0.786 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_body7-640x640-b37118ce_20231211.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py new file mode 100644 index 0000000..c310068 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py @@ -0,0 +1,321 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, + end=280, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + 
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=281, + T_max=300, + end=580, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/coco.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +data_mode = 'bottomup' +data_root = 'data/' + +# train datasets +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + 
data_prefix=dict(img='coco/train2017/'), + pipeline=train_pipeline_stage1, +) + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dataset_coco) + +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 280: { + 'proxy_target_cc': True, + 'overlaps_power': 1.0, + 'loss_cls.loss_weight': 2.0, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + +# model +widen_factor = 1.0 +deepen_factor = 1.0 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + 
mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco' + '_20211126_140236-d3bd2b23.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[256, 512, 1024], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=512, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=17, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=512, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile)), + prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=512, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + 
expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1e-2, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py new file mode 100644 index 0000000..fa158e2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py @@ -0,0 +1,320 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, 
+ end=280, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=281, + T_max=300, + end=580, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/coco.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +data_mode = 'bottomup' +data_root = 'data/' + +# train datasets +dataset_coco = dict( + 
type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=train_pipeline_stage1, +) + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dataset_coco) + +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 280: { + 'proxy_target_cc': True, + 'overlaps_power': 1.0, + 'loss_cls.loss_weight': 2.0, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + +# model +widen_factor = 0.75 +deepen_factor = 0.67 + +model = dict( + 
type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/' + 'pretrained_models/yolox_m_8x8_300e_coco_20230829.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[192, 384, 768], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=384, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=17, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=384, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile)), + prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=384, + 
feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1e-2, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py new file mode 100644 index 0000000..283cf68 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py @@ -0,0 +1,323 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + 
convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, + end=280, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=281, + T_max=300, + end=580, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/coco.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', 
encoder=codec), + dict(type='PackPoseInputs'), +] + +data_mode = 'bottomup' +data_root = 'data/' + +# train datasets +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=train_pipeline_stage1, +) + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dataset_coco) + +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 280: { + 'proxy_target_cc': True, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + 
+# model +widen_factor = 0.5 +deepen_factor = 0.33 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_' + '20211121_095711-4592a793.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[128, 256, 512], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=17, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile), + use_keypoints_for_center=True), + 
prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=256, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1.0, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.md b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.md new file mode 100644 index 0000000..6f8d410 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.md @@ -0,0 +1,43 @@ + + +
+RTMO + +```bibtex +@misc{lu2023rtmo, + title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation}, + author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang}, + year={2023}, + eprint={2312.07526}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [RTMO-s](/configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py) | 640x640 | 0.677 | 0.878 | 0.737 | 0.715 | 0.908 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640_20231211.json) | +| [RTMO-m](/configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py) | 640x640 | 0.709 | 0.890 | 0.778 | 0.747 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_coco-640x640-6f4e0306_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_coco-640x640_20231211.json) | +| [RTMO-l](/configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py) | 640x640 | 0.724 | 0.899 | 0.788 | 0.762 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_coco-640x640-516a421f_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_coco-640x640_20231211.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.yml new file mode 100644 index 0000000..6b48504 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.yml @@ -0,0 +1,56 @@ +Collections: +- Name: RTMO + Paper: + Title: 'RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation' + URL: https://arxiv.org/abs/2312.07526 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/rtmo.md +Models: +- Config: 
 configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py + In Collection: RTMO + Metadata: + Architecture: &id001 + - RTMO + Training Data: COCO + Name: rtmo-s_8xb32-600e_coco-640x640 + Results: + - Dataset: COCO + Metrics: + AP: 0.677 + AP@0.5: 0.878 + AP@0.75: 0.737 + AR: 0.715 + AR@0.5: 0.908 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth +- Config: configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py + In Collection: RTMO + Metadata: + Architecture: *id001 + Training Data: COCO + Name: rtmo-m_16xb16-600e_coco-640x640 + Results: + - Dataset: COCO + Metrics: + AP: 0.709 + AP@0.5: 0.890 + AP@0.75: 0.778 + AR: 0.747 + AR@0.5: 0.920 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_coco-640x640-6f4e0306_20231211.pth +- Config: configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py + In Collection: RTMO + Metadata: + Architecture: *id001 + Training Data: COCO + Name: rtmo-l_16xb16-600e_coco-640x640 + Results: + - Dataset: COCO + Metrics: + AP: 0.724 + AP@0.5: 0.899 + AP@0.75: 0.788 + AR: 0.762 + AR@0.5: 0.927 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_coco-640x640-516a421f_20231211.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py new file mode 100644 index 0000000..f1cc089 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py @@ -0,0 +1,502 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + 
+default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=350, + end=349, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=350, + T_max=320, + end=670, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/crowdpose.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, 
keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +# data settings +data_mode = 'bottomup' +data_root = 'data/' + +# mapping +aic_crowdpose = [(3, 0), (0, 1), (4, 2), (1, 3), (5, 4), (2, 5), + (9, 6), (6, 7), (10, 8), (7, 9), (11, 10), (8, 11), (12, 12), + (13, 13)] + +coco_crowdpose = [ + (5, 0), + (6, 1), + (7, 2), + (8, 3), + (9, 4), + (10, 5), + (11, 6), + (12, 7), + (13, 8), + (14, 9), + (15, 10), + (16, 11), +] + +mpii_crowdpose = [ + (13, 0), + (12, 1), + (14, 2), + (11, 3), + (15, 4), + (10, 5), + (3, 6), + (2, 7), + (4, 8), + (1, 9), + (5, 10), + (0, 11), + (9, 12), + (7, 13), +] + +jhmdb_crowdpose = [(4, 0), (3, 1), (8, 2), (7, 3), (12, 4), (11, 5), (6, 6), + (5, 7), (10, 8), (9, 9), (14, 10), (13, 11), (2, 12), + (0, 13)] + +halpe_crowdpose = [ + (5, 0), + (6, 1), + (7, 2), + (8, 3), + (9, 4), + (10, 5), + (11, 6), + (12, 7), + (13, 8), + (14, 9), + (15, 10), + (16, 11), +] + +posetrack_crowdpose = [ + (5, 0), + (6, 1), + (7, 2), + (8, 3), + (9, 4), + (10, 5), + (11, 6), + (12, 7), + (13, 8), + (14, 9), + (15, 10), + (16, 11), + (2, 12), + (1, 13), +] + +# train datasets +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=14, mapping=coco_crowdpose) + ], +) + 
+dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=14, mapping=aic_crowdpose) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=14, + mapping=[(i, i) for i in range(14)]) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=14, mapping=mpii_crowdpose) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=14, + mapping=jhmdb_crowdpose) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=14, + mapping=halpe_crowdpose) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=14, + mapping=posetrack_crowdpose) + ], +) + +train_dataset_stage1 = dict( + type='CombinedDataset', + metainfo=dict(from_file=metafile), + 
datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + sample_ratio_factor=[1, 0.3, 1, 0.3, 0.3, 0.4, 0.3], + test_mode=False, + pipeline=train_pipeline_stage1) + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset_stage1) + +# val datasets +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + score_mode='bbox', + nms_mode='none', + iou_type='keypoints_crowd', + prefix='crowdpose', + use_area=False, +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=30, + new_train_dataset=dataset_crowdpose, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 350: { + 'proxy_target_cc': True, + 'overlaps_power': 1.0, + 'loss_cls.loss_weight': 2.0, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + 
strict_load=False, + priority=49), +] + +# model +widen_factor = 1.0 +deepen_factor = 1.0 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco' + '_20211126_140236-d3bd2b23.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[256, 512, 1024], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=512, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=14, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=512, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile)), 
+ prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=512, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1e-3, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py new file mode 100644 index 0000000..d944cca --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py @@ -0,0 +1,326 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': 
dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=350, + end=349, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=350, + T_max=320, + end=670, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/crowdpose.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.2, + rotate_factor=30, + scale_factor=(0.5, 1.5), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.6, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + 
dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +data_mode = 'bottomup' +data_root = 'data/' + +# train datasets +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=train_pipeline_stage1, +) + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dataset_crowdpose) + +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + score_mode='bbox', + nms_mode='none', + iou_type='keypoints_crowd', + prefix='crowdpose', + use_area=False, +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=30, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 350: { + 'proxy_target_cc': True, + 'overlaps_power': 1.0, + 
'loss_cls.loss_weight': 2.0, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + +# model +widen_factor = 1.0 +deepen_factor = 1.0 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco' + '_20211126_140236-d3bd2b23.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[256, 512, 1024], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=512, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=14, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=512, + 
widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile)), + prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=512, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1e-3, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py new file mode 100644 index 0000000..0b925f4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py @@ -0,0 +1,325 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + 
constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=350, + end=349, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=350, + T_max=320, + end=670, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/crowdpose.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.2, + rotate_factor=30, + scale_factor=(0.5, 1.5), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.6, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), 
+ dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +data_mode = 'bottomup' +data_root = 'data/' + +# train datasets +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=train_pipeline_stage1, +) + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dataset_crowdpose) + +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + score_mode='bbox', + nms_mode='none', + iou_type='keypoints_crowd', + prefix='crowdpose', + use_area=False, +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + 
type='YOLOXPoseModeSwitchHook', + num_last_epochs=30, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 350: { + 'proxy_target_cc': True, + 'overlaps_power': 1.0, + 'loss_cls.loss_weight': 2.0, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + +# model +widen_factor = 0.75 +deepen_factor = 0.67 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/' + 'pretrained_models/yolox_m_8x8_300e_coco_20230829.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[192, 384, 768], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=384, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + 
type='RTMOHead', + num_keypoints=14, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=384, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile)), + prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=384, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1e-3, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py new file mode 100644 index 0000000..4121f55 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py @@ -0,0 +1,326 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)]) + +auto_scale_lr 
= dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + constructor='ForceDefaultOptimWrapperConstructor', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + force_default_settings=True, + custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=350, + end=349, + by_epoch=True, + convert_to_iter_based=True), + # this scheduler is used to increase the lr from 2e-4 to 5e-4 + dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=350, + T_max=320, + end=670, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700), +] + +# data +input_size = (640, 640) +metafile = 'configs/_base_/datasets/crowdpose.py' +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.2, + rotate_factor=30, + scale_factor=(0.5, 1.5), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.6, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', 
by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='BottomupGetHeatmapMask', get_invalid=True), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +data_mode = 'bottomup' +data_root = 'data/' + +# train datasets +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=train_pipeline_stage1, +) + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dataset_crowdpose) + +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + 
score_mode='bbox', + nms_mode='none', + iou_type='keypoints_crowd', + prefix='crowdpose', + use_area=False, +) +test_evaluator = val_evaluator + +# hooks +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=30, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict( + type='RTMOModeSwitchHook', + epoch_attributes={ + 350: { + 'proxy_target_cc': True, + 'overlaps_power': 1.0, + 'loss_cls.loss_weight': 2.0, + 'loss_mle.loss_weight': 5.0, + 'loss_oks.loss_weight': 10.0 + }, + }, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] + +# model +widen_factor = 0.5 +deepen_factor = 0.33 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_' + '20211121_095711-4592a793.pth', + prefix='backbone.', + )), + neck=dict( + type='HybridEncoder', + in_channels=[128, 256, 512], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_dim=256, + output_indices=[1, 2], + encoder_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, 
+ act_cfg=dict(type='GELU'))), + projector=dict( + type='ChannelMapper', + in_channels=[256, 256], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='BN'), + num_outs=2)), + head=dict( + type='RTMOHead', + num_keypoints=14, + featmap_strides=(16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + cls_feat_channels=256, + channels_per_group=36, + pose_vec_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + assigner=dict( + type='SimOTAAssigner', + dynamic_k_indicator='oks', + oks_calculator=dict(type='PoseOKS', metainfo=metafile)), + prior_generator=dict( + type='MlvlPointGenerator', + centralize_points=True, + strides=[16, 32]), + dcc_cfg=dict( + in_channels=256, + feat_channels=128, + num_bins=(192, 256), + spe_channels=128, + gau_cfg=dict( + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + pos_enc='add')), + overlaps_power=0.5, + loss_cls=dict( + type='VariFocalLoss', + reduction='sum', + use_target_weight=True, + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo=metafile, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_mle=dict( + type='MLECCLoss', + use_target_weight=True, + loss_weight=1e-3, + ), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0), + ), + test_cfg=dict( + input_size=input_size, + score_thr=0.1, + nms_thr=0.65, + )) diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.md new file mode 100644 index 0000000..8c95c5e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.md @@ -0,0 +1,44 @@ + + +
+RTMO + +```bibtex +@misc{lu2023rtmo, + title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation}, + author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang}, + year={2023}, + eprint={2312.07526}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +Results on CrowdPose test + +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [RTMO-s](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py) | 640x640 | 0.673 | 0.882 | 0.729 | 0.737 | 0.682 | 0.591 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-700e_crowdpose-640x640-79f81c0d_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-700e_crowdpose-640x640_20231211.json) | +| [RTMO-m](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py) | 640x640 | 0.711 | 0.897 | 0.771 | 0.774 | 0.719 | 0.634 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rrtmo-m_16xb16-700e_crowdpose-640x640-0eaf670d_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-700e_crowdpose-640x640_20231211.json) | +| [RTMO-l](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py) | 640x640 | 0.732 | 0.907 | 0.793 | 0.792 | 0.741 | 0.653 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_crowdpose-640x640-1008211f_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_crowdpose-640x640_20231211.json) | +| [RTMO-l](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py)\* | 640x640 | 0.838 | 0.947 | 0.893 | 0.888 | 0.847 | 0.772 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_body7-crowdpose-640x640-5bafdc11_20231219.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_body7-crowdpose-640x640_20231219.json) | + +\* indicates the model is trained using a combined dataset 
composed of AI Challenger, COCO, CrowdPose, Halpe, MPII, PoseTrack18 and sub-JHMDB. diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.yml new file mode 100644 index 0000000..8c7804b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.yml @@ -0,0 +1,70 @@ +Models: +- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py + In Collection: RTMO + Metadata: + Architecture: &id001 + - RTMO + Training Data: CrowdPose + Name: rtmo-s_8xb32-700e_crowdpose-640x640 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.673 + AP@0.5: 0.882 + AP@0.75: 0.729 + AP (E): 0.737 + AP (M): 0.682 + AP (L): 0.591 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-700e_crowdpose-640x640-79f81c0d_20231211.pth +- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py + In Collection: RTMO + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: rtmo-m_16xb16-700e_crowdpose-640x640 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.711 + AP@0.5: 0.897 + AP@0.75: 0.771 + AP (E): 0.774 + AP (M): 0.719 + AP (L): 0.634 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rrtmo-m_16xb16-700e_crowdpose-640x640-0eaf670d_20231211.pth +- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py + In Collection: RTMO + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: rtmo-l_16xb16-700e_crowdpose-640x640 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.732 + AP@0.5: 0.907 + AP@0.75: 0.793 + AP (E): 0.792 + AP (M): 0.741 + AP (L): 0.653 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_crowdpose-640x640-1008211f_20231211.pth +- Config: 
configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py + In Collection: RTMO + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: rtmo-l_16xb16-700e_body7-crowdpose-640x640 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.838 + AP@0.5: 0.947 + AP@0.75: 0.893 + AP (E): 0.888 + AP (M): 0.847 + AP (L): 0.772 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_body7-crowdpose-640x640-5bafdc11_20231219.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000..19e4c68 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/README.md @@ -0,0 +1,57 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. +In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. 
+ +## Results and Models + +### COCO Dataset + +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Model | Input Size | AP | AR | Details and Download | +| :----------------: | :--------: | :---: | :---: | :---------------------------------------: | +| RTMPose-t | 256x192 | 0.682 | 0.736 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-s | 256x192 | 0.716 | 0.768 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-m | 256x192 | 0.746 | 0.795 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-l | 256x192 | 0.758 | 0.806 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-t-aic-coco | 256x192 | 0.685 | 0.738 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-s-aic-coco | 256x192 | 0.722 | 0.772 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-m-aic-coco | 256x192 | 0.758 | 0.806 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-l-aic-coco | 256x192 | 0.765 | 0.813 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-m-aic-coco | 384x288 | 0.770 | 0.816 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-l-aic-coco | 384x288 | 0.773 | 0.819 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | + +### MPII Dataset + +| Model | Input Size | PCKh@0.5 | PCKh@0.1 | Details and Download | +| :-------: | :--------: | :------: | :------: | :---------------------------------------: | +| RTMPose-m | 256x256 | 0.907 | 0.348 | [rtmpose_mpii.md](./mpii/rtmpose_mpii.md) | + +### CrowdPose Dataset + +Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector + +| Model | Input Size | AP | AR | Details and Download | +| :-------: | :--------: | :---: | :---: | :------------------------------------------------------: | +| RTMPose-m | 256x192 | 0.706 | 0.788 | [rtmpose_crowdpose.md](./crowdpose/rtmpose_crowdpose.md) | + +### Human-Art Dataset + +Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset + 
+| Model | Input Size | AP | AR | Details and Download | +| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: | +| RTMPose-s | 256x192 | 0.311 | 0.381 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) | +| RTMPose-m | 256x192 | 0.355 | 0.417 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) | +| RTMPose-l | 256x192 | 0.378 | 0.442 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) | + +Results on Human-Art validation dataset with ground-truth bounding-box + +| Model | Input Size | AP | AR | Details and Download | +| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: | +| RTMPose-s | 256x192 | 0.698 | 0.732 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) | +| RTMPose-m | 256x192 | 0.728 | 0.759 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) | +| RTMPose-l | 256x192 | 0.753 | 0.783 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py new file mode 100644 index 0000000..ea72ca7 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py @@ -0,0 +1,553 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 20 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 
0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-l_udp-body7_210e-256x192-5e9558ef_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', 
scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), + (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), 
+ (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +posetrack_coco = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + 
bbox_file=f'{data_root}coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) + +test_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) +# default_hooks = dict( +# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = [ + dict(type='PCKAccuracy', thr=0.1), + dict(type='AUC'), + dict(type='EPE'), +] diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py new file mode 100644 index 0000000..6ffcd6e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py @@ -0,0 +1,553 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 20 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + 
type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-l_udp-body7_210e-384x288-b15bc30d_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + 
test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), 
+ (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), + (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +posetrack_coco = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + 
], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + 
data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +val_dataloader = 
dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) + +test_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) +# default_hooks = dict( +# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = [ + dict(type='PCKAccuracy', thr=0.1), + dict(type='AUC'), + dict(type='EPE'), +] diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py new file mode 100644 index 0000000..2a069ba --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py @@ -0,0 +1,535 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 26 +input_size = (192, 256) + +# runtime +max_epochs = 700 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 512 +val_batch_size = 64 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-256x192-4dba18fc_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, 
+ out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + 
type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping +coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24), + (20, 21), (21, 23), (22, 25)] + +aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), + (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15), + (12, 17), (13, 18)] + +crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17), + (13, 18)] + +mpii_halpe26 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (8, 18), + (9, 17), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_halpe26 = [ + (0, 18), + (2, 17), + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_halpe26 = [(i, i) for i in range(26)] + +ochuman_halpe26 = [(i, i) for i in range(17)] + +posetrack_halpe26 = [ + (0, 0), + (2, 17), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + 
'_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=5, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + 
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', 
+ data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=ochuman_halpe26) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=5, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')] +val_evaluator = test_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py 
b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py new file mode 100644 index 0000000..ae75a5e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py @@ -0,0 +1,535 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 26 +input_size = (288, 384) + +# runtime +max_epochs = 700 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 512 +val_batch_size = 64 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + 
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-384x288-3f5a1437_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping +coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24), + (20, 21), (21, 23), (22, 25)] + +aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), + (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15), + (12, 17), (13, 18)] + +crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17), + (13, 18)] + +mpii_halpe26 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (8, 18), + (9, 17), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_halpe26 = [ + (0, 18), + (2, 17), + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_halpe26 = [(i, i) for i in range(26)] + +ochuman_halpe26 = [(i, i) for i in range(17)] + +posetrack_halpe26 = [ + (0, 0), + (2, 17), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + 
+dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +# data loaders +train_dataloader = dict( + 
batch_size=train_batch_size, + num_workers=10, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + 
num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=ochuman_halpe26) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +test_evaluator = [dict(type='PCKAccuracy', thr=0.1), 
dict(type='AUC')] +val_evaluator = test_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py new file mode 100644 index 0000000..c96ba39 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py @@ -0,0 +1,553 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 20 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + 
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-body7_210e-256x192-e0c9327b_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), + (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +posetrack_coco = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + 
data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + 
pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) + +test_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) +# default_hooks = dict( +# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + 
momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = [ + dict(type='PCKAccuracy', thr=0.1), + dict(type='AUC'), + dict(type='EPE') +] diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py new file mode 100644 index 0000000..4118d7d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py @@ -0,0 +1,553 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 20 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 
103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-body7_210e-384x288-b9bc2b57_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0.0, + drop_path=0.0, + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + 
dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), + (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +posetrack_coco = [ + (0, 0), + (3, 3), + 
(4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + 
data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + 
data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) + +test_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + 
pipeline=val_pipeline, + test_mode=True, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) +# default_hooks = dict( +# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = [ + dict(type='PCKAccuracy', thr=0.1), + dict(type='AUC'), + dict(type='EPE') +] diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py new file mode 100644 index 0000000..cad89e4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py @@ -0,0 +1,529 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 26 +input_size = (192, 256) + +# runtime +max_epochs = 700 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 512 +val_batch_size = 64 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + 
by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# mapping +coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24), + (20, 21), (21, 23), (22, 25)] + +aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), + (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15), + (12, 17), (13, 18)] + +crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17), + (13, 18)] + +mpii_halpe26 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (8, 18), + (9, 17), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_halpe26 = [ + (0, 18), + (2, 17), + (3, 6), + (4, 5), 
+ (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_halpe26 = [(i, i) for i in range(26)] + +ochuman_halpe26 = [(i, i) for i in range(17)] + +posetrack_halpe26 = [ + (0, 0), + (2, 17), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + 
num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + 
data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=ochuman_halpe26) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + 
dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')] +val_evaluator = test_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py new file mode 100644 index 0000000..5c3aff2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py @@ -0,0 +1,542 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 26 +input_size = (288, 384) + +# runtime +max_epochs = 700 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 512 +val_batch_size = 64 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 
2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +# backend_args = dict(backend='local') +backend_args = dict( + backend='petrel', + path_mapping=dict({ + f'{data_root}': 's3://openmmlab/datasets/', + f'{data_root}': 's3://openmmlab/datasets/' + })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping +coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24), + (20, 21), (21, 23), (22, 25)] + +aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), + (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15), + (12, 17), (13, 18)] + +crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), 
(10, 15), (11, 16), (12, 17), + (13, 18)] + +mpii_halpe26 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (8, 18), + (9, 17), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_halpe26 = [ + (0, 18), + (2, 17), + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_halpe26 = [(i, i) for i in range(26)] + +ochuman_halpe26 = [(i, i) for i in range(17)] + +posetrack_halpe26 = [ + (0, 0), + (2, 17), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + 
mapping=mpii_halpe26) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + 
img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=ochuman_halpe26) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + 
dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +test_dataloader = val_dataloader + +# hooks +# default_hooks = dict( +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')] +val_evaluator = test_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py new file mode 100644 index 0000000..7890c58 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py @@ -0,0 +1,535 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 26 +input_size = (192, 256) + +# runtime +max_epochs = 700 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 1024 +val_batch_size = 64 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0), + 
clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' 
+data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.6, 1.4], + rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping +coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24), + (20, 21), (21, 23), (22, 25)] + +aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), + (5, 9), (6, 12), (7, 
14), (8, 16), (9, 11), (10, 13), (11, 15), + (12, 17), (13, 18)] + +crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17), + (13, 18)] + +mpii_halpe26 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (8, 18), + (9, 17), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_halpe26 = [ + (0, 18), + (2, 17), + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_halpe26 = [(i, i) for i in range(26)] + +ochuman_halpe26 = [(i, i) for i in range(17)] + +posetrack_halpe26 = [ + (0, 0), + (2, 17), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + 
data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) 
+ +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=ochuman_halpe26) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + 
data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')] +val_evaluator = test_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py new file mode 100644 index 0000000..a229d05 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py @@ -0,0 +1,553 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 20 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0), + 
paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-s_udp-body7_210e-256x192-8c9ccbdb_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 
'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), 
+ (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +posetrack_coco = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + 
'_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + 
drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) + +test_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) +# default_hooks = dict( +# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = [ + dict(type='PCKAccuracy', thr=0.1), + dict(type='AUC'), + dict(type='EPE') +] diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py new file mode 100644 index 0000000..3404d52 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py @@ -0,0 +1,536 
@@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 26 +input_size = (192, 256) + +# runtime +max_epochs = 700 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 1024 +val_batch_size = 64 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-tiny_udp-body7_210e-256x192-a3775292_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + 
simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.6, 1.4], + rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + 
type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping +coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24), + (20, 21), (21, 23), (22, 25)] + +aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), + (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15), + (12, 17), (13, 18)] + +crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17), + (13, 18)] + +mpii_halpe26 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (8, 18), + (9, 17), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_halpe26 = [ + (0, 18), + (2, 17), + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_halpe26 = [(i, i) for i in range(26)] + +ochuman_halpe26 = [(i, i) for i in range(17)] + +posetrack_halpe26 = [ + (0, 0), + (2, 17), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + 
num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + dataset_coco, + dataset_aic, + 
dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + 
num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=ochuman_halpe26) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + pin_memory=True, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')] +val_evaluator = test_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py new file mode 100644 index 
0000000..966c545 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py @@ -0,0 +1,554 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 20 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-tiny_udp-body7_210e-256x192-a3775292_20230504.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for 
s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), 
+ dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# mapping +aic_coco = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), +] + +crowdpose_coco = [ + (0, 5), + (1, 6), + (2, 7), + (3, 8), + (4, 9), + (5, 10), + (6, 11), + (7, 12), + (8, 13), + (9, 14), + (10, 15), + (11, 16), +] + +mpii_coco = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +ochuman_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +posetrack_coco = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + 
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + 
test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco) + ], +) + +val_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + 
dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco) + ], +) + +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) + +test_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) +# default_hooks = dict( +# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') + 
+test_evaluator = [ + dict(type='PCKAccuracy', thr=0.1), + dict(type='AUC'), + dict(type='EPE') +] diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py new file mode 100644 index 0000000..f0c68e7 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py @@ -0,0 +1,535 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 26 +input_size = (288, 384) + +# runtime +max_epochs = 700 +stage2_num_epochs = 20 +base_lr = 4e-3 +train_batch_size = 256 +val_batch_size = 64 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + 
widen_factor=1.25, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1280, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') 
+] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping +coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24), + (20, 21), (21, 23), (22, 25)] + +aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), + (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15), + (12, 17), (13, 18)] + +crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17), + (13, 18)] + +mpii_halpe26 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (8, 18), + (9, 17), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_halpe26 = [ + (0, 18), + (2, 17), + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_halpe26 = [(i, i) for i in range(26)] + +ochuman_halpe26 = [(i, i) for i in range(17)] + +posetrack_halpe26 = [ + (0, 0), + (2, 17), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + 
data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + 
data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + dataset_coco, + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_halpe, + dataset_posetrack, + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# val datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=coco_halpe26) + ], +) + +val_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_val.json', + data_prefix=dict( + img='pose/ai_challenge/ai_challenger_keypoint' + '_validation_20170911/keypoint_validation_images_20170911/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_halpe26) + ], +) + +val_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_halpe26) + ], +) + +val_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_val.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_halpe26) + ], +) + +val_jhmdb = dict( + 
type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_test.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_halpe26) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_halpe26) + ], +) + +val_ochuman = dict( + type='OCHumanDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='ochuman/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + data_prefix=dict(img='pose/OCHuman/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=ochuman_halpe26) + ], +) + +val_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_val.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_halpe26) + ], +) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'), + datasets=[ + val_coco, + val_aic, + val_crowdpose, + val_mpii, + val_jhmdb, + val_halpe, + val_ochuman, + val_posetrack, + ], + pipeline=val_pipeline, + test_mode=True, + )) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + 
type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')] +val_evaluator = test_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.md new file mode 100644 index 0000000..261949c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.md @@ -0,0 +1,76 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +- Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset. +- `*` denotes model trained on 7 public datasets: + - [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic) + - [MS COCO](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#coco) + - [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose) + - [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii) + - [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset) + - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe) + - [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18) +- `Body8` denotes the addition of the [OCHuman](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#ochuman) dataset, in addition to the 7 datasets mentioned above, for evaluation. + +| Config | Input Size | AP
(COCO) | PCK@0.1
(Body8) | AUC
(Body8) | EPE
(Body8) | Params(M) | FLOPS(G) | Download | +| :--------------------------------------------: | :--------: | :---------------: | :---------------------: | :-----------------: | :-----------------: | :-------: | :------: | :-----------------------------------------------: | +| [RTMPose-t\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py) | 256x192 | 65.9 | 91.44 | 63.18 | 19.45 | 3.34 | 0.36 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-body7_pt-body7_420e-256x192-026a1439_20230504.pth) | +| [RTMPose-s\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py) | 256x192 | 69.7 | 92.45 | 65.15 | 17.85 | 5.47 | 0.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth) | +| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py) | 256x192 | 74.9 | 94.25 | 68.59 | 15.12 | 13.59 | 1.93 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth) | +| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py) | 256x192 | 76.7 | 95.08 | 70.14 | 13.79 | 27.66 | 4.16 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-256x192-4dba18fc_20230504.pth) | +| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py) | 384x288 | 76.6 | 94.64 | 70.38 | 13.98 | 13.72 | 4.33 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth) | +| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py) | 384x288 | 78.3 | 95.36 | 71.58 | 13.08 | 27.79 | 9.35 | 
[Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-384x288-3f5a1437_20230504.pth) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.yml new file mode 100644 index 0000000..768ab0c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.yml @@ -0,0 +1,98 @@ +Collections: +- Name: RTMPose + Paper: + Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose" + URL: https://arxiv.org/abs/2303.07399 + README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md +Models: +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py + In Collection: RTMPose + Metadata: + Architecture: &id001 + - RTMPose + Training Data: &id002 + - AI Challenger + - COCO + - CrowdPose + - MPII + - sub-JHMDB + - Halpe + - PoseTrack18 + Name: rtmpose-t_8xb256-420e_body8-256x192 + Results: + - Dataset: Body8 + Metrics: + AP: 0.659 + Mean@0.1: 0.914 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-body7_pt-body7_420e-256x192-026a1439_20230504.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-s_8xb256-420e_body8-256x192 + Results: + - Dataset: Body8 + Metrics: + AP: 0.697 + Mean@0.1: 0.925 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py + In Collection: RTMPose + Alias: + - human + - body + - body17 + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-m_8xb256-420e_body8-256x192 + Results: + - Dataset: 
Body8 + Metrics: + AP: 0.749 + Mean@0.1: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-l_8xb256-420e_body8-256x192 + Results: + - Dataset: Body8 + Metrics: + AP: 0.767 + Mean@0.1: 0.951 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-256x192-4dba18fc_20230504.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-m_8xb256-420e_body8-384x288 + Results: + - Dataset: Body8 + Metrics: + AP: 0.766 + Mean@0.1: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py + In Collection: RTMPose + Alias: rtmpose-l + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-l_8xb256-420e_body8-384x288 + Results: + - Dataset: Body8 + Metrics: + AP: 0.783 + Mean@0.1: 0.964 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-384x288-3f5a1437_20230504.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.md new file mode 100644 index 0000000..c6ab08d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.md @@ -0,0 +1,74 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+AlphaPose (TPAMI'2022) + +```bibtex +@article{alphapose, + author = {Fang, Hao-Shu and Li, Jiefeng and Tang, Hongyang and Xu, Chao and Zhu, Haoyi and Xiu, Yuliang and Li, Yong-Lu and Lu, Cewu}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + title = {AlphaPose: Whole-Body Regional Multi-Person Pose Estimation and Tracking in Real-Time}, + year = {2022} +} +``` + +
+ +- `*` denotes model trained on 7 public datasets: + - [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic) + - [MS COCO](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#coco) + - [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose) + - [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii) + - [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset) + - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe) + - [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18) +- `Body8` denotes the addition of the [OCHuman](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#ochuman) dataset, in addition to the 7 datasets mentioned above, for evaluation. + +| Config | Input Size | PCK@0.1
(Body8) | AUC
(Body8) | Params(M) | FLOPS(G) | Download | +| :--------------------------------------------------------------: | :--------: | :---------------------: | :-----------------: | :-------: | :------: | :-----------------------------------------------------------------: | +| [RTMPose-t\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py) | 256x192 | 91.89 | 66.35 | 3.51 | 0.37 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-body7_pt-body7-halpe26_700e-256x192-6020f8a6_20230605.pth) | +| [RTMPose-s\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py) | 256x192 | 93.01 | 68.62 | 5.70 | 0.70 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-body7_pt-body7-halpe26_700e-256x192-7f134165_20230605.pth) | +| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py) | 256x192 | 94.75 | 71.91 | 13.93 | 1.95 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7-halpe26_700e-256x192-4d3e73dd_20230605.pth) | +| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py) | 256x192 | 95.37 | 73.19 | 28.11 | 4.19 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7-halpe26_700e-256x192-2abb7558_20230605.pth) | +| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py) | 384x288 | 95.15 | 73.56 | 14.06 | 4.37 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7-halpe26_700e-384x288-89e6428b_20230605.pth) | +| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py) | 384x288 | 95.56 | 74.38 | 28.24 | 9.40 | 
[Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7-halpe26_700e-384x288-734182ce_20230605.pth) | +| [RTMPose-x\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py) | 384x288 | 95.74 | 74.82 | 50.00 | 17.29 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-x_simcc-body7_pt-body7-halpe26_700e-384x288-7fb6e239_20230606.pth) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.yml new file mode 100644 index 0000000..aca2527 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.yml @@ -0,0 +1,107 @@ +Collections: +- Name: RTMPose + Paper: + Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose" + URL: https://arxiv.org/abs/2303.07399 + README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md +Models: +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py + In Collection: RTMPose + Metadata: + Architecture: &id001 + - RTMPose + Training Data: &id002 + - AI Challenger + - COCO + - CrowdPose + - MPII + - sub-JHMDB + - Halpe + - PoseTrack18 + Name: rtmpose-t_8xb1024-700e_body8-halpe26-256x192 + Results: + - Dataset: Body8 + Metrics: + Mean@0.1: 0.919 + AUC: 0.664 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-body7_pt-body7-halpe26_700e-256x192-6020f8a6_20230605.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-s_8xb1024-700e_body8-halpe26-256x192 + Results: + - Dataset: Body8 + Metrics: + Mean@0.1: 0.930 + AUC: 0.682 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-body7_pt-body7-halpe26_700e-256x192-7f134165_20230605.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py + In Collection: RTMPose + Alias: body26 + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-m_8xb512-700e_body8-halpe26-256x192 + Results: + - Dataset: Body8 + Metrics: + Mean@0.1: 0.947 + AUC: 0.719 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7-halpe26_700e-256x192-4d3e73dd_20230605.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-l_8xb512-700e_body8-halpe26-256x192 + Results: + - Dataset: Body8 + Metrics: + Mean@0.1: 0.954 + AUC: 0.732 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7-halpe26_700e-256x192-2abb7558_20230605.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-m_8xb512-700e_body8-halpe26-384x288 + Results: + - Dataset: Body8 + Metrics: + Mean@0.1: 0.952 + AUC: 0.736 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7-halpe26_700e-384x288-89e6428b_20230605.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-l_8xb512-700e_body8-halpe26-384x288 + Results: + - Dataset: Body8 + Metrics: + Mean@0.1: 0.956 + AUC: 0.744 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7-halpe26_700e-384x288-734182ce_20230605.pth +- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-x_8xb256-700e_body8-halpe26-384x288 + Results: + - Dataset: Body8 + Metrics: + Mean@0.1: 0.957 + AUC: 0.748 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-x_simcc-body7_pt-body7-halpe26_700e-384x288-7fb6e239_20230606.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000..e1fda25 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + 
simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + 
type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + 
(9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py new file mode 100644 index 0000000..96be86d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 
4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + 
loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + 
min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + 
momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000..0d354d3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + 
backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + 
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000..24b70dd --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', 
+ data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + 
max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=128 * 2, + num_workers=10, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py new file mode 100644 index 0000000..7cb0e23 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( 
+ type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + 
test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + 
dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=128 * 2, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + 
type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000..d0b2325 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + 
expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + 
dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + 
+custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000..635f8c9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 
116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + 
min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=128 * 2, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + 
dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000..ee4f990 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + 
paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 
'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + 
batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000..dde95b4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,273 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# 
optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + 
label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # Turn off EMA while training the tiny model + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + 
# momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000..d4d8180 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py @@ -0,0 +1,233 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + 
backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + 
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # Turn off EMA while training the tiny model + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.md new file mode 100644 index 0000000..312a36b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.md @@ -0,0 +1,71 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [rtmpose-t](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 0.682 | 0.883 | 0.759 | 0.736 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.json) | +| [rtmpose-s](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.716 | 0.892 | 0.789 | 0.768 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.json) | +| [rtmpose-m](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 0.746 | 0.899 | 0.817 | 0.795 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.json) | +| [rtmpose-l](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 0.758 | 0.906 | 0.826 | 0.806 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.json) | +| [rtmpose-t-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.685 | 0.880 | 0.761 | 0.738 | 0.918 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.json) | +| [rtmpose-s-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.722 | 0.892 | 0.794 | 0.772 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.json) | +| [rtmpose-m-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.758 | 0.903 | 0.826 | 0.806 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.json) | +| [rtmpose-l-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.765 | 0.906 | 0.835 | 0.813 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.json) | +| 
[rtmpose-m-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py) | 384x288 | 0.770 | 0.908 | 0.833 | 0.816 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.json) | +| [rtmpose-l-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py) | 384x288 | 0.773 | 0.907 | 0.835 | 0.819 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.yml new file mode 100644 index 0000000..caea707 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.yml @@ -0,0 +1,170 @@ +Collections: +- Name: RTMPose + Paper: + Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose" + URL: https://arxiv.org/abs/2303.07399 + README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md +Models: +- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py + In Collection: RTMPose + Metadata: + Architecture: &id001 + - RTMPose + Training Data: COCO + Name: rtmpose-t_8xb256-420e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.682 + AP@0.5: 0.883 + AP@0.75: 0.759 + AR: 0.736 + AR@0.5: 0.92 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth +- Config: 
configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: rtmpose-s_8xb256-420e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.716 + AP@0.5: 0.892 + AP@0.75: 0.789 + AR: 0.768 + AR@0.5: 0.929 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth +- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: rtmpose-m_8xb256-420e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.746 + AP@0.5: 0.899 + AP@0.75: 0.817 + AR: 0.795 + AR@0.5: 0.935 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth +- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: rtmpose-l_8xb256-420e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.758 + AP@0.5: 0.906 + AP@0.75: 0.826 + AR: 0.806 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth +- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: &id002 + - COCO + - AI Challenger + Name: rtmpose-t_8xb256-420e_aic-coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.685 + AP@0.5: 0.88 + AP@0.75: 0.761 + AR: 0.738 + AR@0.5: 0.918 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth +- Config: 
configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-s_8xb256-420e_aic-coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.722 + AP@0.5: 0.892 + AP@0.75: 0.794 + AR: 0.772 + AR@0.5: 0.929 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth +- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-m_8xb256-420e_aic-coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.758 + AP@0.5: 0.903 + AP@0.75: 0.826 + AR: 0.806 + AR@0.5: 0.94 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth +- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-l_8xb256-420e_aic-coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.765 + AP@0.5: 0.906 + AP@0.75: 0.835 + AR: 0.813 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth +- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-m_8xb256-420e_aic-coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.770 + AP@0.5: 0.908 + AP@0.75: 0.833 + AR: 0.816 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.pth 
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-l_8xb256-420e_aic-coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.773 + AP@0.5: 0.907 + AP@0.75: 0.835 + AR: 0.819 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py new file mode 100644 index 0000000..9ff68f5 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py @@ -0,0 +1,234 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 5e-4 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + 
data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=14, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + 
max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + test_mode=True, + 
pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='crowdpose/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'crowdpose/annotations/mmpose_crowdpose_test.json', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.md new file mode 100644 index 0000000..9314142 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.md @@ -0,0 +1,60 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector + +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [rtmpose-m](/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.706 | 0.841 | 0.765 | 0.799 | 0.719 | 0.582 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-crowdpose_pt-aic-coco_210e-256x192-e6192cac_20230224.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-crowdpose_pt-aic-coco_210e-256x192-e6192cac_20230224.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.yml new file mode 100644 index 0000000..ddfe25f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.yml @@ -0,0 +1,19 @@ +Models: +- Config: configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py + In Collection: RTMPose + Metadata: + Architecture: + - RTMPose + Training Data: CrowdPose + Name: rtmpose-t_8xb256-420e_coco-256x192 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.706 + AP@0.5: 0.841 + AP@0.75: 0.765 + AP (E): 0.799 + AP (M): 0.719 + AP (L): 0.582 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-crowdpose_pt-aic-coco_210e-256x192-e6192cac_20230224.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py new file mode 100644 
index 0000000..8ac425a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + 
simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/validation_humanart.json', + # bbox_file=f'{data_root}HumanArt/person_detection_results/' + # 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py new file mode 100644 index 0000000..83a2c44 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + 
head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + 
type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/validation_humanart.json', + # bbox_file=f'{data_root}HumanArt/person_detection_results/' + # 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py new file mode 100644 index 0000000..87bd833 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + 
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', 
backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/validation_humanart.json', + # bbox_file=f'{data_root}HumanArt/person_detection_results/' + # 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators 
+val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py new file mode 100644 index 0000000..e5a8092 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py @@ -0,0 +1,233 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + 
norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/validation_humanart.json', + # bbox_file=f'{data_root}HumanArt/person_detection_results/' + # 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # Turn off EMA while training the tiny model + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # 
update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.md new file mode 100644 index 0000000..adc2bbd --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.md @@ -0,0 +1,117 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +
+Human-Art (CVPR'2023)

```bibtex
+@inproceedings{ju2023humanart,
+  title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
+  author={Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year={2023}
+}
```

+ +Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [rtmpose-t-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 0.161 | 0.283 | 0.154 | 0.221 | 0.373 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.json) | +| [rtmpose-t-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py) | 256x192 | 0.249 | 0.395 | 0.256 | 0.323 | 0.485 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.json) | +| [rtmpose-s-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.199 | 0.328 | 0.198 | 0.261 | 0.418 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.json) | +| [rtmpose-s-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py) | 256x192 | 0.311 | 0.462 | 0.323 | 0.381 | 0.540 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.json) | +| [rtmpose-m-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 0.239 | 0.372 | 0.243 | 0.302 | 0.455 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.json) | +| [rtmpose-m-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py) | 256x192 | 0.355 | 0.503 | 0.377 | 0.417 | 0.568 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.json) | +| [rtmpose-l-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 0.260 | 0.393 | 0.267 | 0.323 | 0.472 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.json) | +| [rtmpose-l-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py) | 256x192 | 0.378 | 0.521 | 0.399 | 0.442 | 0.584 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.json) | + +Results on Human-Art validation dataset with ground-truth bounding-box + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| 
:-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [rtmpose-t-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 0.444 | 0.725 | 0.453 | 0.488 | 0.750 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.json) | +| [rtmpose-t-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py) | 256x192 | 0.655 | 0.872 | 0.720 | 0.693 | 0.890 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.json) | +| [rtmpose-s-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.480 | 0.739 | 0.498 | 0.521 | 0.763 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.json) | +| [rtmpose-s-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py) | 256x192 | 0.698 | 0.893 | 0.768 | 0.732 | 0.903 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.json) | +| 
[rtmpose-m-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 0.532 | 0.765 | 0.563 | 0.571 | 0.789 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.json) | +| [rtmpose-m-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py) | 256x192 | 0.728 | 0.895 | 0.791 | 0.759 | 0.906 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.json) | +| [rtmpose-l-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 0.564 | 0.789 | 0.602 | 0.599 | 0.808 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.json) | +| [rtmpose-l-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py) | 256x192 | 0.753 | 0.905 | 0.812 | 0.783 | 0.915 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.json) | + +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | 
:-------------------------------------------: | :-------------------------------------------: | +| [rtmpose-t-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 0.682 | 0.883 | 0.759 | 0.736 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.json) | +| [rtmpose-t-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py) | 256x192 | 0.665 | 0.875 | 0.739 | 0.721 | 0.916 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.json) | +| [rtmpose-s-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.716 | 0.892 | 0.789 | 0.768 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.json) | +| [rtmpose-s-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py) | 256x192 | 0.706 | 0.888 | 0.780 | 0.759 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.json) | +| [rtmpose-m-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 0.746 | 0.899 | 0.817 | 0.795 | 0.935 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.json) | +| [rtmpose-m-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py) | 256x192 | 0.725 | 0.892 | 0.795 | 0.775 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.json) | +| [rtmpose-l-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 0.758 | 0.906 | 0.826 | 0.806 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.json) | +| [rtmpose-l-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py) | 256x192 | 0.748 | 0.901 | 0.816 | 0.796 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.json) | + +Results on COCO val2017 with ground-truth bounding box + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| 
[rtmpose-t-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py) | 256x192 | 0.679 | 0.895 | 0.755 | 0.710 | 0.907 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.json) | +| [rtmpose-s-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py) | 256x192 | 0.725 | 0.916 | 0.798 | 0.753 | 0.925 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.json) | +| [rtmpose-m-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py) | 256x192 | 0.744 | 0.916 | 0.818 | 0.770 | 0.930 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.json) | +| [rtmpose-l-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py) | 256x192 | 0.770 | 0.927 | 0.840 | 0.794 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.yml new file mode 100644 index 0000000..aaabbcd --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.yml @@ -0,0 +1,138 @@ +Collections: +- Name: RTMPose + Paper: + Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose" + URL: https://arxiv.org/abs/2303.07399 + README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md +Models: +- Config: configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py + In Collection: RTMPose + Metadata: + Architecture: &id001 + - RTMPose + Training Data: &id002 + - COCO + - Human-Art + Name: rtmpose-l_8xb256-420e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.748 + AP@0.5: 0.901 + AP@0.75: 0.816 + AR: 0.796 + AR@0.5: 0.938 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.378 + AP@0.5: 0.521 + AP@0.75: 0.399 + AR: 0.442 + AR@0.5: 0.584 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.753 + AP@0.5: 0.905 + AP@0.75: 0.812 + AR: 0.783 + AR@0.5: 0.915 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth +- Config: configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-m_8xb256-420e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.725 + AP@0.5: 0.892 + AP@0.75: 0.795 + AR: 0.775 + AR@0.5: 0.929 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.355 + AP@0.5: 0.503 + AP@0.75: 0.377 + AR: 0.417 + AR@0.5: 0.568 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.728 + AP@0.5: 0.895 + AP@0.75: 0.791 + AR: 0.759 + AR@0.5: 0.906 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth +- Config: 
configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-s_8xb256-420e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.706 + AP@0.5: 0.888 + AP@0.75: 0.780 + AR: 0.759 + AR@0.5: 0.928 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.311 + AP@0.5: 0.462 + AP@0.75: 0.323 + AR: 0.381 + AR@0.5: 0.540 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.698 + AP@0.5: 0.893 + AP@0.75: 0.768 + AR: 0.732 + AR@0.5: 0.903 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth +- Config: configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-t_8xb256-420e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.665 + AP@0.5: 0.875 + AP@0.75: 0.739 + AR: 0.721 + AR@0.5: 0.916 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.249 + AP@0.5: 0.395 + AP@0.75: 0.256 + AR: 0.323 + AR@0.5: 0.485 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.655 + AP@0.5: 0.872 + AP@0.75: 0.720 + AR: 0.693 + AR@0.5: 0.890 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..3f5c6af --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py @@ -0,0 +1,228 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr 
= 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=16, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + 
type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/MPI/', +# f'{data_root}': 's3://openmmlab/datasets/pose/MPI/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + 
min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file=f'{data_root}/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.md new file mode 100644 index 0000000..b8ffd12 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.md @@ -0,0 +1,43 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean / w. flip | Mean@0.1 | ckpt | log | +| :------------------------------------------------------- | :--------: | :------------: | :------: | :------------------------------------------------------: | :------------------------------------------------------: | +| [rtmpose-m](/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py) | 256x256 | 0.907 | 0.348 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-mpii_pt-aic-coco_210e-256x256-ec4dbec8_20230206.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-mpii_pt-aic-coco_210e-256x256-ec4dbec8_20230206.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.yml new file mode 100644 index 0000000..7ff95d0 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.yml @@ -0,0 +1,15 @@ +Models: +- Config: configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py + In Collection: RTMPose + Metadata: + Architecture: + - RTMPose + Training Data: MPII + Name: rtmpose-m_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.907 + Mean@0.1: 0.348 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-mpii_pt-aic-coco_210e-256x256-ec4dbec8_20230206.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/README.md b/modules/rtmpose/configs/body_2d_keypoint/simcc/README.md new file mode 100644 index 0000000..6d377d0 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/README.md @@ -0,0 +1,20 @@ +# Top-down SimCC-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. 
At the 2nd stage, SimCC based methods reformulate human pose estimation as two classification tasks for horizontal and vertical coordinates, and uniformly divide each pixel into several bins, thus obtaining the keypoint coordinates given the features extracted from the bounding box area, following the paradigm introduced in [SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation](https://arxiv.org/abs/2107.03332). + +
+ +
+ +## Results and Models + +### COCO Dataset + +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Model | Input Size | AP | AR | Details and Download | +| :---------------------------: | :--------: | :---: | :---: | :-----------------------------------------------: | +| ResNet-50+SimCC | 384x288 | 0.735 | 0.790 | [resnet_coco.md](./coco/resnet_coco.md) | +| ResNet-50+SimCC | 256x192 | 0.721 | 0.781 | [resnet_coco.md](./coco/resnet_coco.md) | +| S-ViPNAS-MobileNet-V3+SimCC | 256x192 | 0.695 | 0.755 | [vipnas_coco.md](./coco/vipnas_coco.md) | +| MobileNet-V2+SimCC(wo/deconv) | 256x192 | 0.620 | 0.678 | [mobilenetv2_coco.md](./coco/mobilenetv2_coco.md) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.md b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.md new file mode 100644 index 0000000..e6d5b72 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.md @@ -0,0 +1,55 @@ + + +
+SimCC (ECCV'2022) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2107.03332, + title={SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation}, + author={Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao}, + year={2021} +} +``` + +
+ + + +
+MobilenetV2 (CVPR'2018) + +```bibtex +@inproceedings{sandler2018mobilenetv2, + title={Mobilenetv2: Inverted residuals and linear bottlenecks}, + author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4510--4520}, + year={2018} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [simcc_mobilenetv2_wo_deconv](/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py) | 256x192 | 0.620 | 0.855 | 0.697 | 0.678 | 0.902 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192-4b0703bb_20221010.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192-4b0703bb_20221010.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml new file mode 100644 index 0000000..a72c85f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml @@ -0,0 +1,19 @@ +Models: +- Config: configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py + In Collection: SimCC + Metadata: + Architecture: &id001 + - SimCC + - MobilenetV2 + Training Data: COCO + Name: simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.62 + AP@0.5: 0.855 + AP@0.75: 0.697 + AR: 0.678 + AR@0.5: 0.902 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192-4b0703bb_20221010.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.md new file mode 100644 index 0000000..16b5eb0 --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.md @@ -0,0 +1,56 @@ + + +
+SimCC (ECCV'2022) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2107.03332, + title={SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation}, + author={Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao}, + year={2021} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [simcc_resnet_50](/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.721 | 0.897 | 0.798 | 0.781 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192-8e0f5b59_20220919.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192-8e0f5b59_20220919.log.json) | +| [simcc_resnet_50](/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py) | 384x288 | 0.735 | 0.899 | 0.800 | 0.790 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288-45c3ba34_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288-45c3ba34_20220913.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.yml new file mode 100644 index 0000000..04c0aca --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.yml @@ -0,0 +1,41 @@ +Collections: +- Name: SimCC + Paper: + Title: A Simple Coordinate Classification Perspective for Human Pose Estimation + URL: https://arxiv.org/abs/2107.03332 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/simcc.md +Models: +- Config: configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py + In Collection: SimCC + Metadata: + Architecture: &id001 + - SimCC + - ResNet + Training Data: COCO + 
Name: simcc_res50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.721 + AP@0.5: 0.900 + AP@0.75: 0.798 + AR: 0.781 + AR@0.5: 0.937 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192-8e0f5b59_20220919.pth +- Config: configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py + In Collection: SimCC + Metadata: + Architecture: *id001 + Training Data: COCO + Name: simcc_res50_8xb32-140e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.735 + AP@0.5: 0.899 + AP@0.75: 0.800 + AR: 0.790 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288-45c3ba34_20220913.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..1e5f62a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', input_size=(192, 256), sigma=6.0, simcc_split_ratio=2.0) + +# model settings +model = dict( + type='TopdownPoseEstimator', + 
data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MobileNetV2', + widen_factor=1., + out_indices=(7, ), + init_cfg=dict( + type='Pretrained', + checkpoint='mmcls://mobilenet_v2', + )), + head=dict( + type='SimCCHead', + in_channels=1280, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + deconv_out_channels=None, + loss=dict(type='KLDiscretLoss', use_target_weight=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + 
bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py new file mode 100644 index 0000000..c724f83 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=140, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[90, 120], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', input_size=(288, 384), sigma=6.0, simcc_split_ratio=2.0) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='SimCCHead', + in_channels=2048, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for 
s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + loss=dict(type='KLDiscretLoss', use_target_weight=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +test_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..2a08ff9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py @@ -0,0 +1,114 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict(type='MultiStepLR', milestones=[170, 200], gamma=0.1, by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', input_size=(192, 256), sigma=6.0, simcc_split_ratio=2.0) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='SimCCHead', + in_channels=2048, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + loss=dict(type='KLDiscretLoss', use_target_weight=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +test_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..cd2fe16 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py @@ -0,0 +1,119 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = 
dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', input_size=(192, 256), sigma=6.0, simcc_split_ratio=2.0) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ViPNAS_MobileNetV3'), + head=dict( + type='SimCCHead', + in_channels=160, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + deconv_type='vipnas', + deconv_out_channels=(160, 160, 160), + deconv_num_groups=(160, 160, 160), + loss=dict(type='KLDiscretLoss', use_target_weight=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=data_root + 'person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.md b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.md new file mode 100644 index 0000000..4d36b73 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.md @@ -0,0 +1,54 @@ + + +
+SimCC (ECCV'2022) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2107.03332, + title={SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation}, + author={Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao}, + year={2021} +} +``` + +
+ + + +
+ViPNAS (CVPR'2021) + +```bibtex +@article{xu2021vipnas, + title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search}, + author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + year={2021} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [simcc_S-ViPNAS-MobileNetV3](/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py) | 256x192 | 0.695 | 0.883 | 0.772 | 0.755 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192-719f3489_20220922.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192-719f3489_20220922.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml new file mode 100644 index 0000000..3790a10 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml @@ -0,0 +1,19 @@ +Models: +- Config: configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py + In Collection: SimCC + Metadata: + Architecture: &id001 + - SimCC + - ViPNAS + Training Data: COCO + Name: simcc_vipnas-mbv3_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.695 + AP@0.5: 0.883 + AP@0.75: 0.772 + AR: 0.755 + AR@0.5: 0.927 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192-719f3489_20220922.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..74a43d5 --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', input_size=(256, 256), sigma=6.0, simcc_split_ratio=2.0) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='SimCCHead', + in_channels=2048, + out_channels=16, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + deconv_out_channels=None, + loss=dict(type='KLDiscretLoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file=f'{data_root}/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/README.md new file mode 100644 index 0000000..f0b54aa --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/README.md @@ -0,0 +1,133 @@ +# Top-down heatmap-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. Instead of estimating keypoint coordinates directly, the pose estimator will produce heatmaps which represent the likelihood of being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html). + +
+ +
+ +## Results and Models + +### COCO Dataset + +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Model | Input Size | AP | AR | Details and Download | +| :-------------: | :--------: | :---: | :---: | :-------------------------------------------------: | +| ViTPose-h | 256x192 | 0.790 | 0.840 | [vitpose_coco.md](./coco/vitpose_coco.md) | +| HRNet-w48+UDP | 256x192 | 0.768 | 0.817 | [hrnet_udp_coco.md](./coco/hrnet_udp_coco.md) | +| MSPN 4-stg | 256x192 | 0.765 | 0.826 | [mspn_coco.md](./coco/mspn_coco.md) | +| HRNet-w48+Dark | 256x192 | 0.764 | 0.814 | [hrnet_dark_coco.md](./coco/hrnet_dark_coco.md) | +| HRNet-w48 | 256x192 | 0.756 | 0.809 | [hrnet_coco.md](./coco/hrnet_coco.md) | +| HRFormer-B | 256x192 | 0.754 | 0.807 | [hrformer_coco.md](./coco/hrformer_coco.md) | +| RSN-50-3x | 256x192 | 0.750 | 0.814 | [rsn_coco.md](./coco/rsn_coco.md) | +| CSPNeXt-l | 256x192 | 0.750 | 0.800 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) | +| HRNet-w32 | 256x192 | 0.749 | 0.804 | [hrnet_coco.md](./coco/hrnet_coco.md) | +| Swin-L | 256x192 | 0.743 | 0.798 | [swin_coco.md](./coco/swin_coco.md) | +| ViTPose-s | 256x192 | 0.739 | 0.792 | [vitpose_coco.md](./coco/vitpose_coco.md) | +| HRFormer-S | 256x192 | 0.738 | 0.793 | [hrformer_coco.md](./coco/hrformer_coco.md) | +| Swin-B | 256x192 | 0.737 | 0.794 | [swin_coco.md](./coco/swin_coco.md) | +| SEResNet-101 | 256x192 | 0.734 | 0.790 | [seresnet_coco.md](./coco/seresnet_coco.md) | +| SCNet-101 | 256x192 | 0.733 | 0.789 | [scnet_coco.md](./coco/scnet_coco.md) | +| ResNet-101+Dark | 256x192 | 0.733 | 0.786 | [resnet_dark_coco.md](./coco/resnet_dark_coco.md) | +| CSPNeXt-m | 256x192 | 0.732 | 0.785 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) | +| ResNetV1d-101 | 256x192 | 0.732 | 0.785 | [resnetv1d_coco.md](./coco/resnetv1d_coco.md) | +| SEResNet-50 | 256x192 | 0.729 | 0.784 | [seresnet_coco.md](./coco/seresnet_coco.md) | +| SCNet-50 | 256x192 | 0.728 | 0.784 | 
[scnet_coco.md](./coco/scnet_coco.md) | +| ResNet-101 | 256x192 | 0.726 | 0.783 | [resnet_coco.md](./coco/resnet_coco.md) | +| ResNeXt-101 | 256x192 | 0.726 | 0.781 | [resnext_coco.md](./coco/resnext_coco.md) | +| HourglassNet | 256x256 | 0.726 | 0.780 | [hourglass_coco.md](./coco/hourglass_coco.md) | +| ResNeSt-101 | 256x192 | 0.725 | 0.781 | [resnest_coco.md](./coco/resnest_coco.md) | +| RSN-50 | 256x192 | 0.724 | 0.790 | [rsn_coco.md](./coco/rsn_coco.md) | +| Swin-T | 256x192 | 0.724 | 0.782 | [swin_coco.md](./coco/swin_coco.md) | +| MSPN 1-stg | 256x192 | 0.723 | 0.788 | [mspn_coco.md](./coco/mspn_coco.md) | +| ResNetV1d-50 | 256x192 | 0.722 | 0.777 | [resnetv1d_coco.md](./coco/resnetv1d_coco.md) | +| ResNeSt-50 | 256x192 | 0.720 | 0.775 | [resnest_coco.md](./coco/resnest_coco.md) | +| ResNet-50 | 256x192 | 0.718 | 0.774 | [resnet_coco.md](./coco/resnet_coco.md) | +| ResNeXt-50 | 256x192 | 0.715 | 0.771 | [resnext_coco.md](./coco/resnext_coco.md) | +| PVT-S | 256x192 | 0.714 | 0.773 | [pvt_coco.md](./coco/pvt_coco.md) | +| CSPNeXt-s | 256x192 | 0.697 | 0.753 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) | +| LiteHRNet-30 | 256x192 | 0.676 | 0.736 | [litehrnet_coco.md](./coco/litehrnet_coco.md) | +| CSPNeXt-tiny | 256x192 | 0.665 | 0.723 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) | +| MobileNet-v2 | 256x192 | 0.648 | 0.709 | [mobilenetv2_coco.md](./coco/mobilenetv2_coco.md) | +| LiteHRNet-18 | 256x192 | 0.642 | 0.705 | [litehrnet_coco.md](./coco/litehrnet_coco.md) | +| CPM | 256x192 | 0.627 | 0.689 | [cpm_coco.md](./coco/cpm_coco.md) | +| ShuffleNet-v2 | 256x192 | 0.602 | 0.668 | [shufflenetv2_coco.md](./coco/shufflenetv2_coco.md) | +| ShuffleNet-v1 | 256x192 | 0.587 | 0.654 | [shufflenetv1_coco.md](./coco/shufflenetv1_coco.md) | +| AlexNet | 256x192 | 0.448 | 0.521 | [alexnet_coco.md](./coco/alexnet_coco.md) | + +### MPII Dataset + +| Model | Input Size | PCKh@0.5 | PCKh@0.1 | Details and Download | +| :------------: | :--------: | :------: | 
:------: | :-------------------------------------------------: | +| HRNet-w48+Dark | 256x256 | 0.905 | 0.360 | [hrnet_dark_mpii.md](./mpii/hrnet_dark_mpii.md) | +| HRNet-w48 | 256x256 | 0.902 | 0.303 | [hrnet_mpii.md](./mpii/cspnext_udp_mpii.md) | +| HRNet-w48 | 256x256 | 0.901 | 0.337 | [hrnet_mpii.md](./mpii/hrnet_mpii.md) | +| HRNet-w32 | 256x256 | 0.900 | 0.334 | [hrnet_mpii.md](./mpii/hrnet_mpii.md) | +| HourglassNet | 256x256 | 0.889 | 0.317 | [hourglass_mpii.md](./mpii/hourglass_mpii.md) | +| ResNet-152 | 256x256 | 0.889 | 0.303 | [resnet_mpii.md](./mpii/resnet_mpii.md) | +| ResNetV1d-152 | 256x256 | 0.888 | 0.300 | [resnetv1d_mpii.md](./mpii/resnetv1d_mpii.md) | +| SCNet-50 | 256x256 | 0.888 | 0.290 | [scnet_mpii.md](./mpii/scnet_mpii.md) | +| ResNeXt-152 | 256x256 | 0.887 | 0.294 | [resnext_mpii.md](./mpii/resnext_mpii.md) | +| SEResNet-50 | 256x256 | 0.884 | 0.292 | [seresnet_mpii.md](./mpii/seresnet_mpii.md) | +| ResNet-50 | 256x256 | 0.882 | 0.286 | [resnet_mpii.md](./mpii/resnet_mpii.md) | +| ResNetV1d-50 | 256x256 | 0.881 | 0.290 | [resnetv1d_mpii.md](./mpii/resnetv1d_mpii.md) | +| CPM | 368x368\* | 0.876 | 0.285 | [cpm_mpii.md](./mpii/cpm_mpii.md) | +| LiteHRNet-30 | 256x256 | 0.869 | 0.271 | [litehrnet_mpii.md](./mpii/litehrnet_mpii.md) | +| LiteHRNet-18 | 256x256 | 0.859 | 0.260 | [litehrnet_mpii.md](./mpii/litehrnet_mpii.md) | +| MobileNet-v2 | 256x256 | 0.854 | 0.234 | [mobilenetv2_mpii.md](./mpii/mobilenetv2_mpii.md) | +| ShuffleNet-v2 | 256x256 | 0.828 | 0.205 | [shufflenetv2_mpii.md](./mpii/shufflenetv2_mpii.md) | +| ShuffleNet-v1 | 256x256 | 0.824 | 0.195 | [shufflenetv1_mpii.md](./mpii/shufflenetv1_mpii.md) | + +### CrowdPose Dataset + +Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector + +| Model | Input Size | AP | AR | Details and Download | +| :--------: | :--------: | :---: | :---: | :--------------------------------------------------------: | +| HRNet-w32 | 256x192 | 0.675 | 0.816 
| [hrnet_crowdpose.md](./crowdpose/hrnet_crowdpose.md) | +| CSPNeXt-m | 256x192 | 0.662 | 0.755 | [hrnet_crowdpose.md](./crowdpose/cspnext_udp_crowdpose.md) | +| ResNet-101 | 256x192 | 0.647 | 0.800 | [resnet_crowdpose.md](./crowdpose/resnet_crowdpose.md) | +| HRNet-w32 | 256x192 | 0.637 | 0.785 | [resnet_crowdpose.md](./crowdpose/resnet_crowdpose.md) | + +### AIC Dataset + +Results on AIC val set with ground-truth bounding boxes. + +| Model | Input Size | AP | AR | Details and Download | +| :--------: | :--------: | :---: | :---: | :----------------------------------: | +| HRNet-w32 | 256x192 | 0.323 | 0.366 | [hrnet_aic.md](./aic/hrnet_aic.md) | +| ResNet-101 | 256x192 | 0.294 | 0.337 | [resnet_aic.md](./aic/resnet_aic.md) | + +### JHMDB Dataset + +| Model | Input Size | PCK(norm. by person size) | PCK (norm. by torso size) | Details and Download | +| :-------: | :--------: | :-----------------------: | :-----------------------: | :----------------------------------------: | +| ResNet-50 | 256x256 | 96.0 | 80.1 | [resnet_jhmdb.md](./jhmdb/resnet_jhmdb.md) | +| CPM | 368x368 | 89.8 | 65.7 | [cpm_jhmdb.md](./jhmdb/cpm_jhmdb.md) | + +### PoseTrack2018 Dataset + +Results on PoseTrack2018 val with ground-truth bounding boxes. 
+ +| Model | Input Size | AP | Details and Download | +| :-------: | :--------: | :--: | :----------------------------------------------------------: | +| HRNet-w48 | 256x192 | 84.6 | [hrnet_posetrack18.md](./posetrack18/hrnet_posetrack18.md) | +| HRNet-w32 | 256x192 | 83.4 | [hrnet_posetrack18.md](./posetrack18/hrnet_posetrack18.md) | +| ResNet-50 | 256x192 | 81.2 | [resnet_posetrack18.md](./posetrack18/resnet_posetrack18.md) | + +### Human-Art Dataset + +Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset + +| Model | Input Size | AP | AR | Details and Download | +| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: | +| ViTPose-s | 256x192 | 0.381 | 0.448 | [vitpose_humanart.md](./humanart/vitpose_humanart.md) | +| ViTPose-b | 256x192 | 0.410 | 0.475 | [vitpose_humanart.md](./humanart/vitpose_humanart.md) | + +Results on Human-Art validation dataset with ground-truth bounding-box + +| Model | Input Size | AP | AR | Details and Download | +| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: | +| ViTPose-s | 256x192 | 0.738 | 0.768 | [vitpose_humanart.md](./humanart/vitpose_humanart.md) | +| ViTPose-b | 256x192 | 0.759 | 0.790 | [vitpose_humanart.md](./humanart/vitpose_humanart.md) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.md new file mode 100644 index 0000000..282bac8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.md @@ -0,0 +1,38 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+AI Challenger (ArXiv'2017) + +```bibtex +@article{wu2017ai, + title={Ai challenger: A large-scale dataset for going deeper in image understanding}, + author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others}, + journal={arXiv preprint arXiv:1711.06475}, + year={2017} +} +``` + +
+ +Results on AIC val set with ground-truth bounding boxes + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py) | 256x192 | 0.323 | 0.761 | 0.218 | 0.366 | 0.789 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_aic_256x192-30a4e465_20200826.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_aic_256x192_20200826.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.yml new file mode 100644 index 0000000..9550a4f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.yml @@ -0,0 +1,18 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py + In Collection: HRNet + Metadata: + Architecture: + - HRNet + Training Data: AI Challenger + Name: td-hm_hrnet-w32_8xb64-210e_aic-256x192 + Results: + - Dataset: AI Challenger + Metrics: + AP: 0.323 + AP@0.5: 0.761 + AP@0.75: 0.218 + AR: 0.366 + AR@0.5: 0.789 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_aic_256x192-30a4e465_20200826.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.md new file mode 100644 index 0000000..f4a457d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.md @@ -0,0 +1,55 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+AI Challenger (ArXiv'2017) + +```bibtex +@article{wu2017ai, + title={Ai challenger: A large-scale dataset for going deeper in image understanding}, + author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others}, + journal={arXiv preprint arXiv:1711.06475}, + year={2017} +} +``` + +
+ +Results on AIC val set with ground-truth bounding boxes + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py) | 256x192 | 0.294 | 0.736 | 0.172 | 0.337 | 0.762 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_aic_256x192-79b35445_20200826.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_aic_256x192_20200826.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.yml new file mode 100644 index 0000000..9c89bc3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.yml @@ -0,0 +1,19 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - ResNet + Training Data: AI Challenger + Name: td-hm_res101_8xb64-210e_aic-256x192 + Results: + - Dataset: AI Challenger + Metrics: + AP: 0.294 + AP@0.5: 0.736 + AP@0.75: 0.172 + AR: 0.337 + AR@0.5: 0.762 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_aic_256x192-79b35445_20200826.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py new file mode 100644 index 0000000..1c6d8c5 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py @@ -0,0 +1,151 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=14, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AicDataset' +data_mode = 'topdown' +data_root = 
'data/aic/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/aic_val.json', + data_prefix=dict(img='ai_challenger_keypoint_validation_20170911/' + 'keypoint_validation_images_20170911/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/aic_val.json', + use_area=False) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py new file mode 100644 index 0000000..d368730 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py @@ -0,0 
+1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=14, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AicDataset' +data_mode = 'topdown' +data_root = 'data/aic/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders 
+train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/aic_val.json', + data_prefix=dict(img='ai_challenger_keypoint_validation_20170911/' + 'keypoint_validation_images_20170911/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/aic_val.json', + use_area=False) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.md new file mode 100644 index 0000000..4cff144 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.md @@ -0,0 +1,40 @@ + + +
+AlexNet (NeurIPS'2012) + +```bibtex +@inproceedings{krizhevsky2012imagenet, + title={Imagenet classification with deep convolutional neural networks}, + author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + booktitle={Advances in neural information processing systems}, + pages={1097--1105}, + year={2012} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_alexnet](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py) | 256x192 | 0.448 | 0.767 | 0.461 | 0.521 | 0.829 | [ckpt](https://download.openmmlab.com/mmpose/top_down/alexnet/alexnet_coco_256x192-a7b1fd15_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/alexnet/alexnet_coco_256x192_20200727.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.yml new file mode 100644 index 0000000..0451088 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.yml @@ -0,0 +1,19 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - AlexNet + Training Data: COCO + Name: td-hm_alexnet_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.448 + AP@0.5: 0.767 + AP@0.75: 0.461 + AR: 0.521 + AR@0.5: 0.829 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/alexnet/alexnet_coco_256x192-a7b1fd15_20200727.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md new file mode 100644 index 0000000..c0ecaad --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md @@ -0,0 +1,41 @@ + + +
+CPM (CVPR'2016) + +```bibtex +@inproceedings{wei2016convolutional, + title={Convolutional pose machines}, + author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser}, + booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition}, + pages={4724--4732}, + year={2016} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [cpm](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py) | 256x192 | 0.627 | 0.862 | 0.709 | 0.689 | 0.906 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192-0e978875_20220920.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192_20220920.log) | +| [cpm](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py) | 384x288 | 0.652 | 0.865 | 0.730 | 0.710 | 0.907 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288-165487b8_20221011.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288_20221011.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.yml new file mode 100644 index 0000000..aee822b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.yml @@ -0,0 +1,40 @@ +Collections: +- Name: CPM + Paper: + Title: Convolutional pose machines + URL: http://openaccess.thecvf.com/content_cvpr_2016/html/Wei_Convolutional_Pose_Machines_CVPR_2016_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/cpm.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py + In Collection: CPM + Metadata: + Architecture: &id001 + - 
CPM + Training Data: COCO + Name: td-hm_cpm_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.627 + AP@0.5: 0.862 + AP@0.75: 0.709 + AR: 0.689 + AR@0.5: 0.906 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192-0e978875_20220920.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py + In Collection: CPM + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_cpm_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.652 + AP@0.5: 0.865 + AP@0.75: 0.730 + AR: 0.710 + AR@0.5: 0.907 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288-165487b8_20221011.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py new file mode 100644 index 0000000..db92dac --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py @@ -0,0 +1,284 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + 
by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings +keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=19, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', 
backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + 
mapping=keypoint_mapping_coco) + ], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000..a6f6239 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')), + head=dict( + type='HeatmapHead', + in_channels=1024, + 
out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + 
dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py new file mode 100644 index 0000000..d51d467 --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py @@ -0,0 +1,284 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings +keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + 
init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=19, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + 
scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_coco) + ], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 
'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000..a1dd5f6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = 
dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + 
min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = 
val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py new file mode 100644 index 0000000..323a9e8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py @@ -0,0 +1,284 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings 
+keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-s_imagenet_600e-ea671761.pth')), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=19, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + 
dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_coco) + ], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + 
pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000..918b2fa --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-s_imagenet_600e-ea671761.pth')), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = 
dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + 
num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py new file mode 100644 index 0000000..e25d29b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py @@ -0,0 +1,284 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness 
= dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings +keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-tiny_imagenet_600e-3a2dd350.pth')), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=19, + loss=dict(type='KeypointMSELoss', 
use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + 
max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_coco) + ], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # dict( + 
# type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000..576c3be --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 
57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-tiny_imagenet_600e-3a2dd350.pth')), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + 
dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + 
switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.md new file mode 100644 index 0000000..29fd080 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.md @@ -0,0 +1,69 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_cspnext_t_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.665 | 0.874 | 0.723 | 0.723 | 0.917 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-coco_pt-in1k_210e-256x192-0908dd2d_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-coco_pt-in1k_210e-256x192-0908dd2d_20230123.json) | +| [pose_cspnext_s_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.697 | 0.886 | 0.776 | 0.753 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-coco_pt-in1k_210e-256x192-92dbfc1d_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-coco_pt-in1k_210e-256x192-92dbfc1d_20230123.json) | +| [pose_cspnext_m_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.732 | 0.896 | 0.806 | 0.785 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco_pt-in1k_210e-256x192-95f5967e_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco_pt-in1k_210e-256x192-95f5967e_20230123.json) | +| [pose_cspnext_l_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.750 | 0.904 | 0.822 | 0.800 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-coco_pt-in1k_210e-256x192-661cdd8c_20230123.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-coco_pt-in1k_210e-256x192-661cdd8c_20230123.json) | +| [pose_cspnext_t_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.655 | 0.884 | 0.731 | 0.689 | 0.890 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.json) | +| [pose_cspnext_s_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.700 | 0.905 | 0.783 | 0.733 | 0.918 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.json) | +| [pose_cspnext_m_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.748 | 0.925 | 0.818 | 0.777 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.json) | +| [pose_cspnext_l_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.772 | 0.936 | 0.839 | 0.799 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.json) | + +Note that, UDP also adopts the unbiased encoding/decoding algorithm of 
[DARK](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/techniques.html#darkpose-cvpr-2020). + +Flip test and detector is not used in the result of aic-coco training. diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.yml new file mode 100644 index 0000000..b1d9cd8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.yml @@ -0,0 +1,139 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py + In Collection: UDP + Metadata: + Architecture: &id001 + - CSPNeXt + - UDP + Training Data: COCO + Name: cspnext-tiny_udp_8xb256-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.665 + AP@0.5: 0.874 + AP@0.75: 0.723 + AR: 0.723 + AR@0.5: 0.917 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-coco_pt-in1k_210e-256x192-0908dd2d_20230123.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: COCO + Name: cspnext-s_udp_8xb256-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.697 + AP@0.5: 0.886 + AP@0.75: 0.776 + AR: 0.753 + AR@0.5: 0.929 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-coco_pt-in1k_210e-256x192-92dbfc1d_20230123.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: COCO + Name: cspnext-m_udp_8xb256-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.732 + AP@0.5: 0.896 + AP@0.75: 0.806 + AR: 0.785 + AR@0.5: 0.937 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco_pt-in1k_210e-256x192-95f5967e_20230123.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: COCO + Name: cspnext-l_udp_8xb256-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.750 + AP@0.5: 0.904 + AP@0.75: 0.822 + AR: 0.8 + AR@0.5: 0.941 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-coco_pt-in1k_210e-256x192-661cdd8c_20230123.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: + - COCO + - AIC + Name: cspnext-tiny_udp_8xb256-210e_aic-coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.655 + AP@0.5: 0.884 + AP@0.75: 0.731 + AR: 0.689 + AR@0.5: 0.89 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: + - COCO + - AIC + Name: cspnext-s_udp_8xb256-210e_aic-coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.7 + AP@0.5: 0.905 + AP@0.75: 0.783 + AR: 0.733 + AR@0.5: 0.918 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: + - COCO + - AIC + Name: cspnext-m_udp_8xb256-210e_aic-coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.748 + AP@0.5: 0.925 + AP@0.75: 0.818 + AR: 0.777 + AR@0.5: 0.933 
+ Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: + - COCO + - AIC + Name: cspnext-l_udp_8xb256-210e_aic-coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.772 + AP@0.5: 0.936 + AP@0.75: 0.839 + AR: 0.799 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.md new file mode 100644 index 0000000..e66d13e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.md @@ -0,0 +1,42 @@ + + +
+Hourglass (ECCV'2016) + +```bibtex +@inproceedings{newell2016stacked, + title={Stacked hourglass networks for human pose estimation}, + author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia}, + booktitle={European conference on computer vision}, + pages={483--499}, + year={2016}, + organization={Springer} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hourglass_52](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py) | 256x256 | 0.726 | 0.896 | 0.799 | 0.780 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256-4ec713ba_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256_20200709.log.json) | +| [pose_hourglass_52](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py) | 384x384 | 0.746 | 0.900 | 0.812 | 0.797 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384-be91ba2b_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384_20200812.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml new file mode 100644 index 0000000..23d2a9b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml @@ -0,0 +1,40 @@ +Collections: +- Name: Hourglass + Paper: + Title: Stacked hourglass networks for human pose estimation + URL: https://link.springer.com/chapter/10.1007/978-3-319-46484-8_29 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hourglass.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py + In Collection: Hourglass + Metadata: + Architecture: &id001 + - Hourglass + Training Data: COCO + Name: 
td-hm_hourglass52_8xb32-210e_coco-256x256 + Results: + - Dataset: COCO + Metrics: + AP: 0.726 + AP@0.5: 0.896 + AP@0.75: 0.799 + AR: 0.780 + AR@0.5: 0.934 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256-4ec713ba_20200709.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py + In Collection: Hourglass + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hourglass52_8xb32-210e_coco-384x384 + Results: + - Dataset: COCO + Metrics: + AP: 0.746 + AP@0.5: 0.900 + AP@0.75: 0.812 + AR: 0.797 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384-be91ba2b_20200812.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md new file mode 100644 index 0000000..d49774d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md @@ -0,0 +1,43 @@ + + +
+HRFormer (NIPS'2021) + +```bibtex +@article{yuan2021hrformer, + title={HRFormer: High-Resolution Vision Transformer for Dense Prediction}, + author={Yuan, Yuhui and Fu, Rao and Huang, Lang and Lin, Weihong and Zhang, Chao and Chen, Xilin and Wang, Jingdong}, + journal={Advances in Neural Information Processing Systems}, + volume={34}, + year={2021} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrformer_small](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py) | 256x192 | 0.738 | 0.904 | 0.812 | 0.793 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192-5310d898_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192_20220316.log.json) | +| [pose_hrformer_small](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py) | 384x288 | 0.757 | 0.905 | 0.824 | 0.807 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288-98d237ed_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288_20220316.log.json) | +| [pose_hrformer_base](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py) | 256x192 | 0.754 | 0.906 | 0.827 | 0.807 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192-6f5f1169_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192_20220316.log.json) | +| [pose_hrformer_base](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py) | 384x288 | 0.774 | 0.909 | 0.842 | 0.823 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288-ecf0758d_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288_20220316.log.json) | diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.yml new file mode 100644 index 0000000..81e8d2b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.yml @@ -0,0 +1,72 @@ +Collections: +- Name: HRFormer + Paper: + Title: 'HRFormer: High-Resolution Vision Transformer for Dense Predict' + URL: https://proceedings.neurips.cc/paper/2021/hash/3bbfdde8842a5c44a0323518eec97cbe-Abstract.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrformer.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py + In Collection: HRFormer + Metadata: + Architecture: &id001 + - HRFormer + Training Data: COCO + Name: td-hm_hrformer-small_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.738 + AP@0.5: 0.904 + AP@0.75: 0.812 + AR: 0.793 + AR@0.5: 0.941 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192-5310d898_20220316.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py + In Collection: HRFormer + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrformer-small_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.757 + AP@0.5: 0.905 + AP@0.75: 0.824 + AR: 0.807 + AR@0.5: 0.941 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288-98d237ed_20220316.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py + In Collection: HRFormer + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrformer-base_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.754 + AP@0.5: 0.906 + AP@0.75: 0.827 + AR: 0.807 + AR@0.5: 
0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192-6f5f1169_20220316.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py + In Collection: HRFormer + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrformer-base_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.774 + AP@0.5: 0.909 + AP@0.75: 0.842 + AR: 0.823 + AR@0.5: 0.945 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288-ecf0758d_20220316.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.md new file mode 100644 index 0000000..010ecdb --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.md @@ -0,0 +1,62 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+Albumentations (Information'2020) + +```bibtex +@article{buslaev2020albumentations, + title={Albumentations: fast and flexible image augmentations}, + author={Buslaev, Alexander and Iglovikov, Vladimir I and Khvedchenya, Eugene and Parinov, Alex and Druzhinin, Mikhail and Kalinin, Alexandr A}, + journal={Information}, + volume={11}, + number={2}, + pages={125}, + year={2020}, + publisher={Multidisciplinary Digital Publishing Institute} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [coarsedropout](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py) | 256x192 | 0.753 | 0.908 | 0.822 | 0.805 | 0.944 | [ckpt](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_coarsedropout-0f16a0ce_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_coarsedropout_20210320.log.json) | +| [gridmask](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py) | 256x192 | 0.752 | 0.906 | 0.825 | 0.804 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_gridmask-868180df_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_gridmask_20210320.log.json) | +| [photometric](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py) | 256x192 | 0.754 | 0.908 | 0.825 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_photometric-308cf591_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_photometric_20210320.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.yml new file mode 100644 index 0000000..b31ef80 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.yml @@ 
-0,0 +1,56 @@ +Collections: +- Name: Albumentations + Paper: + Title: 'Albumentations: fast and flexible image augmentations' + URL: https://www.mdpi.com/649002 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/albumentations.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py + In Collection: Albumentations + Metadata: + Architecture: &id001 + - HRNet + Training Data: COCO + Name: td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.753 + AP@0.5: 0.908 + AP@0.75: 0.822 + AR: 0.805 + AR@0.5: 0.944 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_coarsedropout-0f16a0ce_20210320.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py + In Collection: Albumentations + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.752 + AP@0.5: 0.906 + AP@0.75: 0.825 + AR: 0.804 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_gridmask-868180df_20210320.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py + In Collection: Albumentations + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.754 + AP@0.5: 0.908 + AP@0.75: 0.825 + AR: 0.805 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_photometric-308cf591_20210320.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.md 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.md new file mode 100644 index 0000000..f8c09f3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.md @@ -0,0 +1,43 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | 256x192 | 0.749 | 0.906 | 0.821 | 0.804 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py) | 384x288 | 0.761 | 0.908 | 0.826 | 0.811 | 0.944 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288-ca5956af_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288_20220909.log) | +| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py) | 256x192 | 0.756 | 0.908 | 0.826 | 0.809 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192_20220913.log) | +| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py) | 384x288 | 0.767 | 0.911 | 0.832 | 0.817 | 0.947 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288-c161b7de_20220915.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288_20220915.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml new file mode 100644 index 0000000..525a496 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml @@ -0,0 +1,124 @@ +Collections: +- Name: HRNet + Paper: + Title: Deep high-resolution representation learning for human pose estimation + URL: http://openaccess.thecvf.com/content_CVPR_2019/html/Sun_Deep_High-Resolution_Representation_Learning_for_Human_Pose_Estimation_CVPR_2019_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrnet.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py + In Collection: HRNet + Metadata: + Architecture: &id001 + - HRNet + Training Data: COCO + Name: td-hm_hrnet-w32_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.746 + AP@0.5: 0.904 + AP@0.75: 0.819 + AR: 0.799 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w32_8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.76 + AP@0.5: 0.906 + AP@0.75: 0.83 + AR: 0.81 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288-ca5956af_20220909.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w48_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.756 + AP@0.5: 0.907 + AP@0.75: 0.825 + AR: 0.806 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w48_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.767 + AP@0.5: 0.91 + AP@0.75: 0.831 + AR: 0.816 + AR@0.5: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288-c161b7de_20220915.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: + - COCO + - AI Challenger + Name: td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge + Results: + - Dataset: COCO + Metrics: + AP: 0.757 + AP@0.5: 0.907 + AP@0.75: 0.829 + AR: 0.809 + AR@0.5: 0.944 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge-b05435b9_20221025.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: + - COCO + - AI Challenger + Name: 
td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine + Results: + - Dataset: COCO + Metrics: + AP: 0.756 + AP@0.5: 0.906 + AP@0.75: 0.826 + AR: 0.807 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine-4ce66880_20221026.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.749 + AP@0.5: 0.907 + AP@0.75: 0.822 + AR: 0.802 + AR@0.5: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192-f1e84e3b_20220914.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco_aic.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco_aic.md new file mode 100644 index 0000000..456acdd --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco_aic.md @@ -0,0 +1,61 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +
+AI Challenger (ArXiv'2017) + +```bibtex +@article{wu2017ai, + title={Ai challenger: A large-scale dataset for going deeper in image understanding}, + author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others}, + journal={arXiv preprint arXiv:1711.06475}, + year={2017} +} +``` + +
+ +MMPose supports training model with combined datasets. [coco-aic-merge](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) and [coco-aic-combine](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) are two examples. + +- [coco-aic-merge](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) leverages AIC data with partial keypoints as auxiliary data to train a COCO model +- [coco-aic-combine](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) constructs a combined dataset whose keypoints are the union of COCO and AIC keypoints to train a model that predicts keypoints of both datasets. + +Evaluation results on COCO val2017 of models trained with solely COCO dataset and combined dataset as shown below. These models are evaluated with detector having human AP of 56.4 on COCO val2017 dataset. 
+ +| Train Set | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :------------------------------------------- | :------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------: | :------------------------------------: | +| [coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | pose_hrnet_w32 | 256x192 | 0.749 | 0.906 | 0.821 | 0.804 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) | +| [coco-aic-merge](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) | pose_hrnet_w32 | 256x192 | 0.756 | 0.907 | 0.828 | 0.809 | 0.944 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge-a9ea6d77_20230818.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge_20230818.json) | +| [coco-aic-combine](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) | pose_hrnet_w32 | 256x192 | 0.755 | 0.904 | 0.825 | 0.807 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine-458125cc_20230818.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine_20230818.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.md new file mode 100644 index 
0000000..89fa371 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.md @@ -0,0 +1,60 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.907 | 0.825 | 0.807 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192-0e00bf12_20220914.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192_20220914.log) | +| [pose_hrnet_w32_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.766 | 0.907 | 0.829 | 0.815 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288-9bab4c9b_20220917.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288_20220917.log) | +| [pose_hrnet_w48_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py) | 256x192 | 0.764 | 0.907 | 0.831 | 0.814 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192-e1ebdd6f_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192_20220913.log) | +| [pose_hrnet_w48_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py) | 384x288 | 0.772 | 0.911 | 0.833 | 0.821 | 0.948 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288-39c3c381_20220916.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288_20220916.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.yml new file mode 100644 index 0000000..ae3d2df --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.yml @@ -0,0 +1,73 @@ +Collections: +- Name: DarkPose + Paper: + Title: Distribution-aware coordinate representation for human pose estimation + URL: http://openaccess.thecvf.com/content_CVPR_2020/html/Zhang_Distribution-Aware_Coordinate_Representation_for_Human_Pose_Estimation_CVPR_2020_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/dark.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py + In Collection: DarkPose + Metadata: + Architecture: &id001 + - HRNet + - DarkPose + Training Data: COCO + Name: td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.757 + AP@0.5: 0.907 + AP@0.75: 0.825 + AR: 0.807 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192-0e00bf12_20220914.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py + In Collection: DarkPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.766 + AP@0.5: 0.907 + AP@0.75: 0.829 + AR: 0.815 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288-9bab4c9b_20220917.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py + In Collection: DarkPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.764 + AP@0.5: 0.907 + AP@0.75: 0.831 + AR: 0.814 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192-e1ebdd6f_20220913.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py + In Collection: DarkPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.772 + AP@0.5: 0.911 + AP@0.75: 0.833 + AR: 0.821 + AR@0.5: 0.948 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288-39c3c381_20220916.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_fp16_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_fp16_coco.md new file mode 100644 index 0000000..79aa611 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_fp16_coco.md @@ -0,0 +1,56 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+FP16 (ArXiv'2017) + +```bibtex +@article{micikevicius2017mixed, + title={Mixed precision training}, + author={Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others}, + journal={arXiv preprint arXiv:1710.03740}, + year={2017} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32_fp16](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py) | 256x192 | 0.749 | 0.907 | 0.822 | 0.802 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192-f1e84e3b_20220914.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192_20220914.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md new file mode 100644 index 0000000..988df0f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md @@ -0,0 +1,63 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py) | 256x192 | 0.762 | 0.907 | 0.829 | 0.810 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192-73ede547_20220914.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192_20220914.log) | +| [pose_hrnet_w32_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py) | 384x288 | 0.768 | 0.909 | 0.832 | 0.815 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288-9a3f7c85_20220914.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288_20220914.log) | +| [pose_hrnet_w48_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py) | 256x192 | 0.768 | 0.908 | 0.833 | 0.817 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192-3feaef8f_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192_20220913.log) | +| [pose_hrnet_w48_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py) | 384x288 | 0.773 | 0.911 | 0.836 | 0.821 | 0.946 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288-70d7ab01_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288_20220913.log) | +| [pose_hrnet_w32_udp_regress](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py) | 256x192 | 0.759 | 0.907 | 0.827 | 0.813 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192-9c0b77b4_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192_20220226.log) | + +Note that, UDP also adopts the unbiased encoding/decoding algorithm of [DARK](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/techniques.html#darkpose-cvpr-2020). diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.yml new file mode 100644 index 0000000..3971f52 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.yml @@ -0,0 +1,90 @@ +Collections: +- Name: UDP + Paper: + Title: 'The Devil Is in the Details: Delving Into Unbiased Data Processing for + Human Pose Estimation' + URL: http://openaccess.thecvf.com/content_CVPR_2020/html/Huang_The_Devil_Is_in_the_Details_Delving_Into_Unbiased_Data_CVPR_2020_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/udp.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py + In Collection: UDP + Metadata: + Architecture: &id001 + - HRNet + - UDP + Training Data: COCO + Name: td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 
0.762 + AP@0.5: 0.907 + AP@0.75: 0.829 + AR: 0.810 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192-73ede547_20220914.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.768 + AP@0.5: 0.909 + AP@0.75: 0.832 + AR: 0.815 + AR@0.5: 0.945 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288-9a3f7c85_20220914.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.768 + AP@0.5: 0.908 + AP@0.75: 0.833 + AR: 0.817 + AR@0.5: 0.945 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192-3feaef8f_20220913.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.773 + AP@0.5: 0.911 + AP@0.75: 0.836 + AR: 0.821 + AR@0.5: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288-70d7ab01_20220913.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 
+ Training Data: COCO + Name: td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.759 + AP@0.5: 0.907 + AP@0.75: 0.827 + AR: 0.813 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192-9c0b77b4_20220926.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.md new file mode 100644 index 0000000..2bdb62d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.md @@ -0,0 +1,42 @@ + + +
+LiteHRNet (CVPR'2021) + +```bibtex +@inproceedings{Yulitehrnet21, + title={Lite-HRNet: A Lightweight High-Resolution Network}, + author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong}, + booktitle={CVPR}, + year={2021} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [LiteHRNet-18](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py) | 256x192 | 0.642 | 0.867 | 0.719 | 0.705 | 0.911 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_256x192-6bace359_20211230.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_256x192_20211230.log.json) | +| [LiteHRNet-18](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py) | 384x288 | 0.676 | 0.876 | 0.746 | 0.735 | 0.919 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_384x288-8d4dac48_20211230.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_384x288_20211230.log.json) | +| [LiteHRNet-30](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py) | 256x192 | 0.676 | 0.880 | 0.756 | 0.736 | 0.922 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_256x192-4176555b_20210626.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_256x192_20210626.log.json) | +| [LiteHRNet-30](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py) | 384x288 | 0.700 | 0.883 | 0.776 | 0.758 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_384x288-a3aef5c4_20210626.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_384x288_20210626.log.json) | diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.yml new file mode 100644 index 0000000..11ecf92 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.yml @@ -0,0 +1,72 @@ +Collections: +- Name: LiteHRNet + Paper: + Title: 'Lite-HRNet: A Lightweight High-Resolution Network' + URL: https://arxiv.org/abs/2104.06403 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/litehrnet.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py + In Collection: LiteHRNet + Metadata: + Architecture: &id001 + - LiteHRNet + Training Data: COCO + Name: td-hm_litehrnet-18_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.642 + AP@0.5: 0.867 + AP@0.75: 0.719 + AR: 0.705 + AR@0.5: 0.911 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_256x192-6bace359_20211230.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py + In Collection: LiteHRNet + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_litehrnet-18_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.676 + AP@0.5: 0.876 + AP@0.75: 0.746 + AR: 0.735 + AR@0.5: 0.919 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_384x288-8d4dac48_20211230.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py + In Collection: LiteHRNet + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_litehrnet-30_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.676 + AP@0.5: 0.88 + AP@0.75: 0.756 + AR: 0.736 + AR@0.5: 0.922 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_256x192-4176555b_20210626.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py + In Collection: LiteHRNet + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_litehrnet-30_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.7 + AP@0.5: 0.883 + AP@0.75: 0.776 + AR: 0.758 + AR@0.5: 0.926 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_384x288-a3aef5c4_20210626.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md new file mode 100644 index 0000000..7df4a42 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md @@ -0,0 +1,41 @@ + + +
+MobilenetV2 (CVPR'2018) + +```bibtex +@inproceedings{sandler2018mobilenetv2, + title={Mobilenetv2: Inverted residuals and linear bottlenecks}, + author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4510--4520}, + year={2018} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py) | 256x192 | 0.648 | 0.874 | 0.725 | 0.709 | 0.918 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192-55a04c35_20221016.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192_20221016.log) | +| [pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py) | 384x288 | 0.677 | 0.882 | 0.746 | 0.734 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288-d3ab1457_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288_20221013.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.yml new file mode 100644 index 0000000..644a6b6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.yml @@ -0,0 +1,35 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - MobilenetV2 + Training Data: COCO + Name: td-hm_mobilenetv2_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.648 
+ AP@0.5: 0.874 + AP@0.75: 0.725 + AR: 0.709 + AR@0.5: 0.918 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192-55a04c35_20221016.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_mobilenetv2_8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.677 + AP@0.5: 0.882 + AP@0.75: 0.746 + AR: 0.734 + AR@0.5: 0.920 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288-d3ab1457_20221013.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.md new file mode 100644 index 0000000..a67cd63 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.md @@ -0,0 +1,42 @@ + + +
+MSPN (ArXiv'2019) + +```bibtex +@article{li2019rethinking, + title={Rethinking on Multi-Stage Networks for Human Pose Estimation}, + author={Li, Wenbo and Wang, Zhicheng and Yin, Binyi and Peng, Qixiang and Du, Yuming and Xiao, Tianzi and Yu, Gang and Lu, Hongtao and Wei, Yichen and Sun, Jian}, + journal={arXiv preprint arXiv:1901.00148}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [mspn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.723 | 0.895 | 0.794 | 0.788 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mspn/mspn50_coco_256x192-8fbfb5d0_20201123.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mspn/mspn50_coco_256x192_20201123.log.json) | +| [2xmspn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.754 | 0.903 | 0.826 | 0.816 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mspn/2xmspn50_coco_256x192-c8765a5c_20201123.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mspn/2xmspn50_coco_256x192_20201123.log.json) | +| [3xmspn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.758 | 0.904 | 0.830 | 0.821 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mspn/3xmspn50_coco_256x192-e348f18e_20201123.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mspn/3xmspn50_coco_256x192_20201123.log.json) | +| [4xmspn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.765 | 0.906 | 0.835 | 0.826 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mspn/4xmspn50_coco_256x192-7b837afb_20201123.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mspn/4xmspn50_coco_256x192_20201123.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.yml 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.yml new file mode 100644 index 0000000..1165bbc --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.yml @@ -0,0 +1,72 @@ +Collections: +- Name: MSPN + Paper: + Title: Rethinking on Multi-Stage Networks for Human Pose Estimation + URL: https://arxiv.org/abs/1901.00148 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/mspn.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py + In Collection: MSPN + Metadata: + Architecture: &id001 + - MSPN + Training Data: COCO + Name: td-hm_mspn50_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.723 + AP@0.5: 0.895 + AP@0.75: 0.794 + AR: 0.788 + AR@0.5: 0.934 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/mspn/mspn50_coco_256x192-8fbfb5d0_20201123.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py + In Collection: MSPN + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_2xmspn50_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.754 + AP@0.5: 0.903 + AP@0.75: 0.826 + AR: 0.816 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/mspn/2xmspn50_coco_256x192-c8765a5c_20201123.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py + In Collection: MSPN + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_3xmspn50_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.758 + AP@0.5: 0.904 + AP@0.75: 0.83 + AR: 0.821 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/mspn/3xmspn50_coco_256x192-e348f18e_20201123.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py + In 
Collection: MSPN + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_4xmspn50_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.765 + AP@0.5: 0.906 + AP@0.75: 0.835 + AR: 0.826 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/mspn/4xmspn50_coco_256x192-7b837afb_20201123.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.md new file mode 100644 index 0000000..74a189d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.md @@ -0,0 +1,57 @@ + + +
+PVT (ICCV'2021) + +```bibtex +@inproceedings{wang2021pyramid, + title={Pyramid vision transformer: A versatile backbone for dense prediction without convolutions}, + author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={568--578}, + year={2021} +} +``` + +
+ +
+PVTV2 (CVMJ'2022) + +```bibtex +@article{wang2022pvt, + title={PVT v2: Improved baselines with Pyramid Vision Transformer}, + author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling}, + journal={Computational Visual Media}, + pages={1--10}, + year={2022}, + publisher={Springer} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_pvt-s](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py) | 256x192 | 0.714 | 0.896 | 0.794 | 0.773 | 0.936 | [ckpt](https://download.openmmlab.com/mmpose/top_down/pvt/pvt_small_coco_256x192-4324a49d_20220501.pth) | [log](https://download.openmmlab.com/mmpose/top_down/pvt/pvt_small_coco_256x192_20220501.log.json) | +| [pose_pvtv2-b2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py) | 256x192 | 0.737 | 0.905 | 0.812 | 0.791 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/pvt/pvtv2_b2_coco_256x192-b4212737_20220501.pth) | [log](https://download.openmmlab.com/mmpose/top_down/pvt/pvtv2_b2_coco_256x192_20220501.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.yml new file mode 100644 index 0000000..202ec81 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.yml @@ -0,0 +1,35 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - PVT + Training Data: COCO + Name: td-hm_pvt-s_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.714 + AP@0.5: 0.896 + AP@0.75: 0.794 + AR: 0.773 + AR@0.5: 0.936 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/pvt/pvt_small_coco_256x192-4324a49d_20220501.pth +- Config: 
configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_pvtv2-b2_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.737 + AP@0.5: 0.905 + AP@0.75: 0.812 + AR: 0.791 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/pvt/pvtv2_b2_coco_256x192-b4212737_20220501.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.md new file mode 100644 index 0000000..8bee32c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.md @@ -0,0 +1,46 @@ + + +
+ResNeSt (ArXiv'2020) + +```bibtex +@article{zhang2020resnest, + title={ResNeSt: Split-Attention Networks}, + author={Zhang, Hang and Wu, Chongruo and Zhang, Zhongyue and Zhu, Yi and Zhang, Zhi and Lin, Haibin and Sun, Yue and He, Tong and Muller, Jonas and Manmatha, R. and Li, Mu and Smola, Alexander}, + journal={arXiv preprint arXiv:2004.08955}, + year={2020} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnest_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py) | 256x192 | 0.720 | 0.899 | 0.800 | 0.775 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_256x192-6e65eece_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_256x192_20210320.log.json) | +| [pose_resnest_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py) | 384x288 | 0.737 | 0.900 | 0.811 | 0.789 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_384x288-dcd20436_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_384x288_20210320.log.json) | +| [pose_resnest_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py) | 256x192 | 0.725 | 0.900 | 0.807 | 0.781 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_256x192-2ffcdc9d_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_256x192_20210320.log.json) | +| [pose_resnest_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py) | 384x288 | 0.745 | 0.905 | 0.818 | 0.798 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_384x288-80660658_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_384x288_20210320.log.json) | +| 
[pose_resnest_200](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py) | 256x192 | 0.731 | 0.905 | 0.812 | 0.787 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_256x192-db007a48_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_256x192_20210517.log.json) | +| [pose_resnest_200](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py) | 384x288 | 0.753 | 0.907 | 0.827 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_384x288-b5bb76cb_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_384x288_20210517.log.json) | +| [pose_resnest_269](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py) | 256x192 | 0.737 | 0.907 | 0.819 | 0.792 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_256x192-2a7882ac_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_256x192_20210517.log.json) | +| [pose_resnest_269](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py) | 384x288 | 0.754 | 0.908 | 0.828 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_384x288-b142b9fb_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_384x288_20210517.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.yml new file mode 100644 index 0000000..d039829 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.yml @@ -0,0 +1,131 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py + In Collection: 
SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNeSt + Training Data: COCO + Name: td-hm_resnest50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.720 + AP@0.5: 0.899 + AP@0.75: 0.8 + AR: 0.775 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_256x192-6e65eece_20210320.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnest50_8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.737 + AP@0.5: 0.9 + AP@0.75: 0.811 + AR: 0.789 + AR@0.5: 0.937 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_384x288-dcd20436_20210320.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnest101_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.725 + AP@0.5: 0.9 + AP@0.75: 0.807 + AR: 0.781 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_256x192-2ffcdc9d_20210320.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnest101_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.745 + AP@0.5: 0.905 + AP@0.75: 0.818 + AR: 0.798 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_384x288-80660658_20210320.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py + In Collection: 
SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnest200_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.731 + AP@0.5: 0.905 + AP@0.75: 0.812 + AR: 0.787 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_256x192-db007a48_20210517.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnest200_8xb16-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.753 + AP@0.5: 0.907 + AP@0.75: 0.827 + AR: 0.805 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_384x288-b5bb76cb_20210517.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnest269_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.737 + AP@0.5: 0.907 + AP@0.75: 0.819 + AR: 0.792 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_256x192-2a7882ac_20210517.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnest269_8xb16-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.754 + AP@0.5: 0.908 + AP@0.75: 0.828 + AR: 0.805 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_384x288-b142b9fb_20210517.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md new file mode 100644 index 0000000..83b7a90 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md @@ -0,0 +1,68 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.718 | 0.898 | 0.796 | 0.774 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192-04af38ce_20220923.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192_20220923.log) | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py) | 384x288 | 0.731 | 0.900 | 0.799 | 0.782 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288-7b8db90e_20220923.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288_20220923.log) | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py) | 256x192 | 0.728 | 0.904 | 0.809 | 0.783 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192_20220926.log) | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py) | 384x288 | 0.749 | 0.906 | 0.817 | 0.799 | 0.941 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192_20220926.log) | +| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py) | 256x192 | 0.736 | 0.904 | 0.818 | 0.791 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192-0345f330_20220928.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192_20220928.log) | +| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py) | 384x288 | 0.750 | 0.908 | 0.821 | 0.800 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288-7fbb906f_20220927.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288_20220927.log) | + +The following model is equipped with a visibility prediction head and has been trained using COCO and AIC datasets. 
+ +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py) | 256x192 | 0.729 | 0.900 | 0.807 | 0.783 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge-21815b2c_20230726.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192_20220923.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.yml new file mode 100644 index 0000000..ad6dce9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.yml @@ -0,0 +1,121 @@ +Collections: +- Name: SimpleBaseline2D + Paper: + Title: Simple baselines for human pose estimation and tracking + URL: http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/simplebaseline2d.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: COCO + Name: td-hm_res50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.718 + AP@0.5: 0.898 + AP@0.75: 0.796 + AR: 0.774 + AR@0.5: 0.934 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192-04af38ce_20220923.pth +- Config: 
configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res50_8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.731 + AP@0.5: 0.9 + AP@0.75: 0.799 + AR: 0.782 + AR@0.5: 0.937 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288-7b8db90e_20220923.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res101_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.728 + AP@0.5: 0.904 + AP@0.75: 0.809 + AR: 0.783 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res101_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.749 + AP@0.5: 0.906 + AP@0.75: 0.817 + AR: 0.799 + AR@0.5: 0.941 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res152_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.736 + AP@0.5: 0.904 + AP@0.75: 0.818 + AR: 0.791 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192-0345f330_20220928.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res152_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.75 + AP@0.5: 0.908 + AP@0.75: 0.821 + AR: 0.8 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288-7fbb906f_20220927.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res50_fp16-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.716 + AP@0.5: 0.898 + AP@0.75: 0.798 + AR: 0.772 + AR@0.5: 0.937 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192-463da051_20220927.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md new file mode 100644 index 0000000..9156fb4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md @@ -0,0 +1,79 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnet_50_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.724 | 0.897 | 0.797 | 0.777 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192-c129dcb6_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192_20220926.log) | +| [pose_resnet_50_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.735 | 0.902 | 0.801 | 0.786 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288-8b90b538_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288_20220926.log) | +| [pose_resnet_101_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.733 | 0.900 | 0.810 | 0.786 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192-528ec248_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192_20220926.log) | +| [pose_resnet_101_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.749 | 0.905 | 0.818 | 0.799 | 0.940 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288-487d40a4_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288_20220926.log) | +| [pose_resnet_152_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py) | 256x192 | 0.743 | 0.906 | 0.819 | 0.796 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192-f754df5f_20221031.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192_20221031.log) | +| [pose_resnet_152_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py) | 384x288 | 0.757 | 0.907 | 0.825 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288-329f8454_20221031.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288_20221031.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.yml new file mode 100644 index 0000000..c5e156f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.yml @@ -0,0 +1,100 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py + In Collection: DarkPose + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + - DarkPose + Training Data: COCO + Name: td-hm_res50_dark-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.724 + AP@0.5: 0.897 + AP@0.75: 0.797 + AR: 0.777 + AR@0.5: 0.934 + Task: Body 2D 
Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192-c129dcb6_20220926.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py + In Collection: DarkPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res50_dark-8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.735 + AP@0.5: 0.902 + AP@0.75: 0.801 + AR: 0.786 + AR@0.5: 0.938 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288-8b90b538_20220926.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py + In Collection: DarkPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res101_dark-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.733 + AP@0.5: 0.9 + AP@0.75: 0.81 + AR: 0.786 + AR@0.5: 0.938 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192-528ec248_20220926.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py + In Collection: DarkPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res101_dark-8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.749 + AP@0.5: 0.905 + AP@0.75: 0.818 + AR: 0.799 + AR@0.5: 0.94 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288-487d40a4_20220926.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py + In Collection: DarkPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res152_dark-8xb32-210e_coco-256x192 + Results: + - Dataset: COCO 
+ Metrics: + AP: 0.743 + AP@0.5: 0.906 + AP@0.75: 0.819 + AR: 0.796 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192-f754df5f_20221031.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py + In Collection: DarkPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_res152_dark-8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.757 + AP@0.5: 0.907 + AP@0.75: 0.825 + AR: 0.805 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288-329f8454_20221031.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md new file mode 100644 index 0000000..8785e3b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md @@ -0,0 +1,73 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+FP16 (ArXiv'2017) + +```bibtex +@article{micikevicius2017mixed, + title={Mixed precision training}, + author={Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others}, + journal={arXiv preprint arXiv:1710.03740}, + year={2017} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnet_50_fp16](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py) | 256x192 | 0.716 | 0.898 | 0.798 | 0.772 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192-463da051_20220927.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192_20220927.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md new file mode 100644 index 0000000..59ac34f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md @@ -0,0 +1,45 @@ + + +
+ResNetV1D (CVPR'2019) + +```bibtex +@inproceedings{he2019bag, + title={Bag of tricks for image classification with convolutional neural networks}, + author={He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={558--567}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py) | 256x192 | 0.722 | 0.897 | 0.796 | 0.777 | 0.936 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192-27545d63_20221020.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192_20221020.log) | +| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py) | 384x288 | 0.730 | 0.899 | 0.800 | 0.782 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288-0646b46e_20221020.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288_20221020.log) | +| [pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py) | 256x192 | 0.732 | 0.901 | 0.808 | 0.785 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192-ee9e7212_20221021.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192_20221021.log) | +| [pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py) | 384x288 | 0.748 | 0.906 | 0.817 | 0.798 | 0.941 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288-d0b5875f_20221028.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288_20221028.log) | +| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py) | 256x192 | 0.737 | 0.904 | 0.814 | 0.790 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192-fd49f947_20221021.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192_20221021.log) | +| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py) | 384x288 | 0.751 | 0.907 | 0.821 | 0.801 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288-b9a99602_20221022.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288_20221022.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.yml new file mode 100644 index 0000000..4acdfe4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.yml @@ -0,0 +1,99 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNetV1D + Training Data: COCO + Name: td-hm_resnetv1d50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.722 + AP@0.5: 0.897 + AP@0.75: 0.796 + AR: 0.777 + AR@0.5: 0.936 + Task: Body 2D Keypoint + 
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192-27545d63_20221020.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnetv1d50_8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.73 + AP@0.5: 0.899 + AP@0.75: 0.8 + AR: 0.782 + AR@0.5: 0.935 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288-0646b46e_20221020.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnetv1d101_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.732 + AP@0.5: 0.901 + AP@0.75: 0.808 + AR: 0.785 + AR@0.5: 0.940 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192-ee9e7212_20221021.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnetv1d101_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.748 + AP@0.5: 0.906 + AP@0.75: 0.817 + AR: 0.798 + AR@0.5: 0.941 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288-d0b5875f_20221028.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: 
td-hm_resnetv1d152_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.737 + AP@0.5: 0.904 + AP@0.75: 0.814 + AR: 0.790 + AR@0.5: 0.94 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192-fd49f947_20221021.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnetv1d152_8xb48-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.751 + AP@0.5: 0.907 + AP@0.75: 0.821 + AR: 0.801 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288-b9a99602_20221022.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.md new file mode 100644 index 0000000..ca7c1b5 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.md @@ -0,0 +1,45 @@ + + +
+ResNext (CVPR'2017) + +```bibtex +@inproceedings{xie2017aggregated, + title={Aggregated residual transformations for deep neural networks}, + author={Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1492--1500}, + year={2017} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_resnext_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py) | 256x192 | 0.715 | 0.897 | 0.791 | 0.771 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_256x192-dcff15f6_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_256x192_20200727.log.json) | +| [pose_resnext_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py) | 384x288 | 0.724 | 0.899 | 0.794 | 0.777 | 0.936 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_384x288-412c848f_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_384x288_20200727.log.json) | +| [pose_resnext_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py) | 256x192 | 0.726 | 0.900 | 0.801 | 0.781 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_256x192-c7eba365_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_256x192_20200727.log.json) | +| [pose_resnext_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py) | 384x288 | 0.744 | 0.903 | 0.815 | 0.794 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_384x288-f5eabcd6_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_384x288_20200727.log.json) | +| 
[pose_resnext_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py) | 256x192 | 0.730 | 0.903 | 0.808 | 0.785 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_256x192-102449aa_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_256x192_20200727.log.json) | +| [pose_resnext_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py) | 384x288 | 0.742 | 0.904 | 0.810 | 0.794 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_384x288-806176df_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_384x288_20200727.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.yml new file mode 100644 index 0000000..29b02d2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.yml @@ -0,0 +1,99 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNext + Training Data: COCO + Name: td-hm_resnext50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.715 + AP@0.5: 0.897 + AP@0.75: 0.791 + AR: 0.771 + AR@0.5: 0.935 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_256x192-dcff15f6_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnext50_8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.724 + AP@0.5: 0.899 + AP@0.75: 0.794 + AR: 0.777 + AR@0.5: 0.936 + Task: Body 2D 
Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_384x288-412c848f_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnext101_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.726 + AP@0.5: 0.9 + AP@0.75: 0.801 + AR: 0.781 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_256x192-c7eba365_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnext101_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.744 + AP@0.5: 0.903 + AP@0.75: 0.815 + AR: 0.794 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_384x288-f5eabcd6_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnext152_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.73 + AP@0.5: 0.903 + AP@0.75: 0.808 + AR: 0.785 + AR@0.5: 0.94 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_256x192-102449aa_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_resnext152_8xb48-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.742 + AP@0.5: 0.904 + AP@0.75: 0.81 + AR: 0.794 + AR@0.5: 0.94 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_384x288-806176df_20200727.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md new file mode 100644 index 0000000..b5470d1 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md @@ -0,0 +1,44 @@ + + +
+RSN (ECCV'2020) + +```bibtex +@misc{cai2020learning, + title={Learning Delicate Local Representations for Multi-Person Pose Estimation}, + author={Yuanhao Cai and Zhicheng Wang and Zhengxiong Luo and Binyi Yin and Angang Du and Haoqian Wang and Xinyu Zhou and Erjin Zhou and Xiangyu Zhang and Jian Sun}, + year={2020}, + eprint={2003.04030}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [rsn_18](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py) | 256x192 | 0.704 | 0.887 | 0.781 | 0.773 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192-9049ed09_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192_20221013.log) | +| [rsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.724 | 0.894 | 0.799 | 0.790 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192-c35901d5_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192_20221013.log) | +| [2xrsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.748 | 0.900 | 0.821 | 0.810 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192-9ede341e_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192_20221013.log) | +| [3xrsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.750 | 0.900 | 0.824 | 0.814 | 0.941 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192-c3e3c4fe_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192_20221013.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.yml new file mode 100644 index 0000000..9ef71e1 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.yml @@ -0,0 +1,72 @@ +Collections: +- Name: RSN + Paper: + Title: Learning Delicate Local Representations for Multi-Person Pose Estimation + URL: https://link.springer.com/chapter/10.1007/978-3-030-58580-8_27 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/rsn.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py + In Collection: RSN + Metadata: + Architecture: &id001 + - RSN + Training Data: COCO + Name: td-hm_rsn18_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.704 + AP@0.5: 0.887 + AP@0.75: 0.781 + AR: 0.773 + AR@0.5: 0.927 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192-9049ed09_20221013.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py + In Collection: RSN + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_rsn50_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.724 + AP@0.5: 0.894 + AP@0.75: 0.799 + AR: 0.79 + AR@0.5: 0.935 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192-c35901d5_20221013.pth +- Config: 
configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py + In Collection: RSN + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_2xrsn50_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.748 + AP@0.5: 0.9 + AP@0.75: 0.821 + AR: 0.81 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192-9ede341e_20221013.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py + In Collection: RSN + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_3xrsn50_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.75 + AP@0.5: 0.9 + AP@0.75: 0.824 + AR: 0.814 + AR@0.5: 0.941 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192-c3e3c4fe_20221013.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.md new file mode 100644 index 0000000..c02ef7d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.md @@ -0,0 +1,43 @@ + + +
+SCNet (CVPR'2020) + +```bibtex +@inproceedings{liu2020improving, + title={Improving Convolutional Networks with Self-Calibrated Convolutions}, + author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={10096--10105}, + year={2020} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_scnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py) | 256x192 | 0.728 | 0.899 | 0.807 | 0.784 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_256x192-6920f829_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_256x192_20200709.log.json) | +| [pose_scnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py) | 384x288 | 0.751 | 0.906 | 0.818 | 0.802 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_384x288-9cacd0ea_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_384x288_20200709.log.json) | +| [pose_scnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py) | 256x192 | 0.733 | 0.902 | 0.811 | 0.789 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_256x192-6d348ef9_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_256x192_20200709.log.json) | +| [pose_scnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py) | 384x288 | 0.752 | 0.906 | 0.823 | 0.804 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_384x288-0b6e631b_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_384x288_20200709.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.yml 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.yml new file mode 100644 index 0000000..33d1f99 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.yml @@ -0,0 +1,66 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SCNet + Training Data: COCO + Name: td-hm_scnet50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.728 + AP@0.5: 0.899 + AP@0.75: 0.807 + AR: 0.784 + AR@0.5: 0.938 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_256x192-6920f829_20200709.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: topdown_heatmap_scnet50_coco_384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.751 + AP@0.5: 0.906 + AP@0.75: 0.818 + AR: 0.802 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_384x288-9cacd0ea_20200709.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_scnet101_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.733 + AP@0.5: 0.902 + AP@0.75: 0.811 + AR: 0.789 + AR@0.5: 0.94 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_256x192-6d348ef9_20200709.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_scnet101_8xb48-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.752 + AP@0.5: 
0.906 + AP@0.75: 0.823 + AR: 0.804 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_384x288-0b6e631b_20200709.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.md new file mode 100644 index 0000000..f08f1f3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.md @@ -0,0 +1,47 @@ + + +
+SEResNet (CVPR'2018) + +```bibtex +@inproceedings{hu2018squeeze, + title={Squeeze-and-excitation networks}, + author={Hu, Jie and Shen, Li and Sun, Gang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={7132--7141}, + year={2018} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_seresnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py) | 256x192 | 0.729 | 0.903 | 0.807 | 0.784 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_256x192-25058b66_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_256x192_20200727.log.json) | +| [pose_seresnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py) | 384x288 | 0.748 | 0.904 | 0.819 | 0.799 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_384x288-bc0b7680_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_384x288_20200727.log.json) | +| [pose_seresnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py) | 256x192 | 0.734 | 0.905 | 0.814 | 0.790 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_256x192-83f29c4d_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_256x192_20200727.log.json) | +| [pose_seresnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py) | 384x288 | 0.754 | 0.907 | 0.823 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_384x288-48de1709_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_384x288_20200727.log.json) | +| 
[pose_seresnet_152\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py) | 256x192 | 0.730 | 0.899 | 0.810 | 0.787 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_256x192-1c628d79_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_256x192_20200727.log.json) | +| [pose_seresnet_152\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py) | 384x288 | 0.753 | 0.906 | 0.824 | 0.806 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_384x288-58b23ee8_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_384x288_20200727.log.json) | + +Note that * means without imagenet pre-training. diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.yml new file mode 100644 index 0000000..3a4f04a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.yml @@ -0,0 +1,98 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SEResNet + Training Data: COCO + Name: td-hm_seresnet50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.729 + AP@0.5: 0.903 + AP@0.75: 0.807 + AR: 0.784 + AR@0.5: 0.941 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_256x192-25058b66_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_seresnet50_8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.748 + AP@0.5: 0.904 + 
AP@0.75: 0.819 + AR: 0.799 + AR@0.5: 0.941 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_384x288-bc0b7680_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_seresnet101_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.734 + AP@0.5: 0.905 + AP@0.75: 0.814 + AR: 0.79 + AR@0.5: 0.941 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_256x192-83f29c4d_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_seresnet101_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.754 + AP@0.5: 0.907 + AP@0.75: 0.823 + AR: 0.805 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_384x288-48de1709_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_seresnet152_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.73 + AP@0.5: 0.899 + AP@0.75: 0.81 + AR: 0.787 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_256x192-1c628d79_20200727.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_seresnet152_8xb48-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.753 + AP@0.5: 0.906 + AP@0.75: 0.824 + AR: 
0.806 + AR@0.5: 0.945 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_384x288-58b23ee8_20200727.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md new file mode 100644 index 0000000..d331889 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md @@ -0,0 +1,41 @@ + + +
+ShufflenetV1 (CVPR'2018) + +```bibtex +@inproceedings{zhang2018shufflenet, + title={Shufflenet: An extremely efficient convolutional neural network for mobile devices}, + author={Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={6848--6856}, + year={2018} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py) | 256x192 | 0.587 | 0.849 | 0.654 | 0.654 | 0.896 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192-7a7ea4f4_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192_20221013.log) | +| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py) | 384x288 | 0.626 | 0.862 | 0.696 | 0.687 | 0.903 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288-8342f8ba_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288_20221013.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.yml new file mode 100644 index 0000000..c20a130 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.yml @@ -0,0 +1,35 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ShufflenetV1 + Training Data: COCO + Name: td-hm_shufflenetv1_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + 
Metrics: + AP: 0.587 + AP@0.5: 0.849 + AP@0.75: 0.654 + AR: 0.654 + AR@0.5: 0.896 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192-7a7ea4f4_20221013.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_shufflenetv1_8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.626 + AP@0.5: 0.862 + AP@0.75: 0.696 + AR: 0.687 + AR@0.5: 0.903 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288-8342f8ba_20221013.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md new file mode 100644 index 0000000..3c80e76 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md @@ -0,0 +1,41 @@ + + +
+ShufflenetV2 (ECCV'2018) + +```bibtex +@inproceedings{ma2018shufflenet, + title={Shufflenet v2: Practical guidelines for efficient cnn architecture design}, + author={Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={116--131}, + year={2018} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py) | 256x192 | 0.602 | 0.857 | 0.672 | 0.668 | 0.902 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192-51fb931e_20221014.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192_20221014.log) | +| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py) | 384x288 | 0.638 | 0.866 | 0.707 | 0.699 | 0.910 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288-d30ab55c_20221014.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288_20221014.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.yml new file mode 100644 index 0000000..3c87873 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.yml @@ -0,0 +1,35 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ShufflenetV2 + Training Data: COCO + Name: td-hm_shufflenetv2_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + 
Metrics: + AP: 0.602 + AP@0.5: 0.857 + AP@0.75: 0.672 + AR: 0.668 + AR@0.5: 0.902 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192-51fb931e_20221014.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_shufflenetv2_8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.638 + AP@0.5: 0.866 + AP@0.75: 0.707 + AR: 0.699 + AR@0.5: 0.91 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288-d30ab55c_20221014.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.md new file mode 100644 index 0000000..0d142ce --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.md @@ -0,0 +1,78 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+Swin (ICCV'2021) + +```bibtex +@inproceedings{liu2021swin, + title={Swin transformer: Hierarchical vision transformer using shifted windows}, + author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={10012--10022}, + year={2021} +} +``` + +
+ + + +
+FPN (CVPR'2017) + +```bibtex +@inproceedings{lin2017feature, + title={Feature pyramid networks for object detection}, + author={Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={2117--2125}, + year={2017} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_swin_t](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py) | 256x192 | 0.724 | 0.901 | 0.806 | 0.782 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_t_p4_w7_coco_256x192-eaefe010_20220503.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_t_p4_w7_coco_256x192_20220503.log.json) | +| [pose_swin_b](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py) | 256x192 | 0.737 | 0.904 | 0.820 | 0.794 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_256x192-7432be9e_20220705.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_256x192_20220705.log.json) | +| [pose_swin_b](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py) | 384x288 | 0.759 | 0.910 | 0.832 | 0.811 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_384x288-3abf54f9_20220705.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_384x288_20220705.log.json) | +| [pose_swin_l](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py) | 256x192 | 0.743 | 0.906 | 0.821 | 0.798 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_256x192-642a89db_20220705.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_256x192_20220705.log.json) | +| 
[pose_swin_l](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py) | 384x288 | 0.763 | 0.912 | 0.830 | 0.814 | 0.949 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_384x288-c36b7845_20220705.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_384x288_20220705.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.yml new file mode 100644 index 0000000..569993e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.yml @@ -0,0 +1,99 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - Swin + Training Data: COCO + Name: td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.724 + AP@0.5: 0.901 + AP@0.75: 0.806 + AR: 0.782 + AR@0.5: 0.94 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_t_p4_w7_coco_256x192-eaefe010_20220503.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.737 + AP@0.5: 0.904 + AP@0.75: 0.82 + AR: 0.794 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_256x192-7432be9e_20220705.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + 
Metrics: + AP: 0.759 + AP@0.5: 0.91 + AP@0.75: 0.832 + AR: 0.811 + AR@0.5: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_384x288-3abf54f9_20220705.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.743 + AP@0.5: 0.906 + AP@0.75: 0.821 + AR: 0.798 + AR@0.5: 0.943 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_256x192-642a89db_20220705.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.763 + AP@0.5: 0.912 + AP@0.75: 0.83 + AR: 0.814 + AR@0.5: 0.949 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_384x288-c36b7845_20220705.pth +- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/swin_b_p4_w7_fpn_coco_256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO + Name: topdown_heatmap_swin_b_p4_w7_fpn_coco_256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.741 + AP@0.5: 0.907 + AP@0.75: 0.821 + AR: 0.798 + AR@0.5: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_fpn_coco_256x192-a3b91c45_20220705.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py new file mode 100644 index 0000000..dbed81b --- 
/dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py @@ -0,0 +1,167 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='VisPredictHead', + loss=dict( + type='BCELoss', + use_target_weight=True, + use_sigmoid=True, + loss_weight=1e-3, + ), + pose_cfg=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec)), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), 
+ dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root='data/aic/', + data_mode=data_mode, + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + # score_mode='bbox', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator 
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..131a4fe --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py @@ -0,0 +1,152 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +# multiple kernel_sizes of heatmap gaussian for 'Megvii' approach. 
+kernel_sizes = [15, 11, 9, 7, 5] +codec = [ + dict( + type='MegviiHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + kernel_size=kernel_size) for kernel_size in kernel_sizes +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MSPN', + unit_channels=256, + num_stages=2, + num_units=4, + num_blocks=[3, 4, 6, 3], + norm_cfg=dict(type='BN'), + init_cfg=dict( + type='Pretrained', + checkpoint='torchvision://resnet50', + )), + head=dict( + type='MSPNHead', + out_shape=(64, 48), + unit_channels=256, + out_channels=17, + num_stages=2, + num_units=4, + norm_cfg=dict(type='BN'), + # each sub list is for a stage + # and each element in each list is for a unit + level_indices=[0, 1, 2, 3] + [1, 2, 3, 4], + loss=([ + dict( + type='KeypointMSELoss', + use_target_weight=True, + loss_weight=0.25) + ] * 3 + [ + dict( + type='KeypointOHKMMSELoss', + use_target_weight=True, + loss_weight=1.) 
+ ]) * 2, + decoder=codec[-1]), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='GenerateTarget', multilevel=True, encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..0eb4a71 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +# multiple kernel_sizes of heatmap gaussian for 'Megvii' approach. 
+kernel_sizes = [15, 11, 9, 7, 5] +codec = [ + dict( + type='MegviiHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + kernel_size=kernel_size) for kernel_size in kernel_sizes +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='RSN', + unit_channels=256, + num_stages=2, + num_units=4, + num_blocks=[3, 4, 6, 3], + num_steps=4, + norm_cfg=dict(type='BN'), + ), + head=dict( + type='MSPNHead', + out_shape=(64, 48), + unit_channels=256, + out_channels=17, + num_stages=2, + num_units=4, + norm_cfg=dict(type='BN'), + # each sub list is for a stage + # and each element in each list is for a unit + level_indices=[0, 1, 2, 3] + [1, 2, 3, 4], + loss=([ + dict( + type='KeypointMSELoss', + use_target_weight=True, + loss_weight=0.25) + ] * 3 + [ + dict( + type='KeypointOHKMMSELoss', + use_target_weight=True, + loss_weight=1.) 
+ ]) * 2, + decoder=codec[-1]), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='GenerateTarget', multilevel=True, encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none') +test_evaluator = val_evaluator + +# fp16 settings +fp16 = dict(loss_scale='dynamic') diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..0d3020d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py @@ -0,0 +1,152 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +# multiple kernel_sizes of heatmap gaussian for 'Megvii' approach. 
+kernel_sizes = [15, 11, 9, 7, 5] +codec = [ + dict( + type='MegviiHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + kernel_size=kernel_size) for kernel_size in kernel_sizes +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MSPN', + unit_channels=256, + num_stages=3, + num_units=4, + num_blocks=[3, 4, 6, 3], + norm_cfg=dict(type='BN'), + init_cfg=dict( + type='Pretrained', + checkpoint='torchvision://resnet50', + )), + head=dict( + type='MSPNHead', + out_shape=(64, 48), + unit_channels=256, + out_channels=17, + num_stages=3, + num_units=4, + norm_cfg=dict(type='BN'), + # each sub list is for a stage + # and each element in each list is for a unit + level_indices=[0, 1, 2, 3] * 2 + [1, 2, 3, 4], + loss=([ + dict( + type='KeypointMSELoss', + use_target_weight=True, + loss_weight=0.25) + ] * 3 + [ + dict( + type='KeypointOHKMMSELoss', + use_target_weight=True, + loss_weight=1.) 
+ ]) * 3, + decoder=codec[-1]), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='GenerateTarget', multilevel=True, encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..afc35be --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +# multiple kernel_sizes of heatmap gaussian for 'Megvii' approach. 
+kernel_sizes = [15, 11, 9, 7, 5] +codec = [ + dict( + type='MegviiHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + kernel_size=kernel_size) for kernel_size in kernel_sizes +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='RSN', + unit_channels=256, + num_stages=3, + num_units=4, + num_blocks=[3, 4, 6, 3], + num_steps=4, + norm_cfg=dict(type='BN'), + ), + head=dict( + type='MSPNHead', + out_shape=(64, 48), + unit_channels=256, + out_channels=17, + num_stages=3, + num_units=4, + norm_cfg=dict(type='BN'), + # each sub list is for a stage + # and each element in each list is for a unit + level_indices=[0, 1, 2, 3] * 2 + [1, 2, 3, 4], + loss=([ + dict( + type='KeypointMSELoss', + use_target_weight=True, + loss_weight=0.25) + ] * 3 + [ + dict( + type='KeypointOHKMMSELoss', + use_target_weight=True, + loss_weight=1.) 
+ ]) * 3, + decoder=codec[-1]), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='GenerateTarget', multilevel=True, encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none') +test_evaluator = val_evaluator + +# fp16 settings +fp16 = dict(loss_scale='dynamic') diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..a3870f4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py @@ -0,0 +1,152 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +# multiple kernel_sizes of heatmap gaussian for 'Megvii' approach. 
+kernel_sizes = [15, 11, 9, 7, 5] +codec = [ + dict( + type='MegviiHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + kernel_size=kernel_size) for kernel_size in kernel_sizes +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MSPN', + unit_channels=256, + num_stages=4, + num_units=4, + num_blocks=[3, 4, 6, 3], + norm_cfg=dict(type='BN'), + init_cfg=dict( + type='Pretrained', + checkpoint='torchvision://resnet50', + )), + head=dict( + type='MSPNHead', + out_shape=(64, 48), + unit_channels=256, + out_channels=17, + num_stages=4, + num_units=4, + norm_cfg=dict(type='BN'), + # each sub list is for a stage + # and each element in each list is for a unit + level_indices=[0, 1, 2, 3] * 3 + [1, 2, 3, 4], + loss=([ + dict( + type='KeypointMSELoss', + use_target_weight=True, + loss_weight=0.25) + ] * 3 + [ + dict( + type='KeypointOHKMMSELoss', + use_target_weight=True, + loss_weight=1.) 
+ ]) * 4, + decoder=codec[-1]), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='GenerateTarget', multilevel=True, encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..9cedcfe --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py @@ -0,0 +1,153 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch='base', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.3, + with_cls_token=False, + 
out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_base_20230913.pth'), + ), + neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + deconv_out_channels=[], + deconv_kernel_sizes=[], + final_layer=dict(kernel_size=3, padding=1), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec, + ), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + 
bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..f672aff --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = 
dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch='base', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.3, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_base_20230913.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + 
pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..6c8316b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py @@ -0,0 +1,153 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=32, + layer_decay_rate=0.85, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # 
warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch='huge', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.55, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_huge_20230913.pth'), + ), + neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=17, + deconv_out_channels=[], + deconv_kernel_sizes=[], + final_layer=dict(kernel_size=3, padding=1), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec, + ), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..483231d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + 
paramwise_cfg=dict( + num_layers=32, + layer_decay_rate=0.85, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch='huge', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.55, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_huge_20230913.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..f5576bc --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py @@ -0,0 +1,153 @@ 
+_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=24, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch='large', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.5, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_large_20230913.pth'), + ), + neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), + head=dict( + type='HeatmapHead', + 
in_channels=1024, + out_channels=17, + deconv_out_channels=[], + deconv_kernel_sizes=[], + final_layer=dict(kernel_size=3, padding=1), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec, + ), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') 
+test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..d0f6adb --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=24, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + 
type='mmpretrain.VisionTransformer', + arch='large', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.5, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_large_20230913.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..4eedd6d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py @@ -0,0 +1,158 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + 
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch={ + 'embed_dims': 384, + 'num_layers': 12, + 'num_heads': 12, + 'feedforward_channels': 384 * 4 + }, + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.1, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_small_20230913.pth'), + ), + neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=17, + deconv_out_channels=[], + deconv_kernel_sizes=[], + final_layer=dict(kernel_size=3, padding=1), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec, + ), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + 
batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..4918eea --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py @@ -0,0 +1,155 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': 
dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch={ + 'embed_dims': 384, + 'num_layers': 12, + 'num_heads': 12, + 'feedforward_channels': 384 * 4 + }, + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.1, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_small_20230913.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..dcd903f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# 
optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(40, 56), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='AlexNet', num_classes=-1), + head=dict( + type='HeatmapHead', + in_channels=256, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..5d71939 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(36, 48), sigma=3) + +# model 
settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CPM', + in_channels=3, + out_channels=17, + feat_channels=128, + num_stages=6), + head=dict( + type='CPMHead', + in_channels=17, + out_channels=17, + num_stages=6, + deconv_out_channels=None, + final_layer=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 
'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..662a0fe --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(24, 32), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CPM', + in_channels=3, + out_channels=17, + feat_channels=128, + num_stages=6), + head=dict( + type='CPMHead', + in_channels=17, + out_channels=17, + num_stages=6, + deconv_out_channels=None, + final_layer=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + 
test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py new file mode 100644 index 0000000..b83f3ce --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HourglassNet', + num_stacks=1, + ), + head=dict( + type='CPMHead', + in_channels=256, + out_channels=17, + num_stages=1, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py new file mode 100644 index 0000000..86e35f8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', 
begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(384, 384), heatmap_size=(96, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HourglassNet', + num_stacks=1, + ), + head=dict( + type='CPMHead', + in_channels=256, + out_channels=17, + num_stages=1, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + 
data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..6537ad0 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py @@ -0,0 +1,174 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=5e-4, + betas=(0.9, 0.999), + weight_decay=0.01, + ), + paramwise_cfg=dict( + custom_keys={'relative_position_bias_table': dict(decay_mult=0.)})) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( 
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRFormer', + in_channels=3, + norm_cfg=norm_cfg, + extra=dict( + drop_path_rate=0.2, + with_rpe=True, + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(2, ), + num_channels=(64, ), + num_heads=[2], + mlp_ratios=[4]), + stage2=dict( + num_modules=1, + num_branches=2, + block='HRFORMERBLOCK', + num_blocks=(2, 2), + num_channels=(78, 156), + num_heads=[2, 4], + mlp_ratios=[4, 4], + window_sizes=[7, 7]), + stage3=dict( + num_modules=4, + num_branches=3, + block='HRFORMERBLOCK', + num_blocks=(2, 2, 2), + num_channels=(78, 156, 312), + num_heads=[2, 4, 8], + mlp_ratios=[4, 4, 4], + window_sizes=[7, 7, 7]), + stage4=dict( + num_modules=2, + num_branches=4, + block='HRFORMERBLOCK', + num_blocks=(2, 2, 2, 2), + num_channels=(78, 156, 312, 624), + num_heads=[2, 4, 8, 16], + mlp_ratios=[4, 4, 4, 4], + window_sizes=[7, 7, 7, 7])), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrformer_base-32815020_20220226.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=78, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator + +# fp16 settings +fp16 = dict(loss_scale='dynamic') diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..b055be5 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py @@ -0,0 +1,174 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + 
type='AdamW', + lr=5e-4, + betas=(0.9, 0.999), + weight_decay=0.01, + ), + paramwise_cfg=dict( + custom_keys={'relative_position_bias_table': dict(decay_mult=0.)})) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRFormer', + in_channels=3, + norm_cfg=norm_cfg, + extra=dict( + drop_path_rate=0.2, + with_rpe=True, + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(2, ), + num_channels=(64, ), + num_heads=[2], + mlp_ratios=[4]), + stage2=dict( + num_modules=1, + num_branches=2, + block='HRFORMERBLOCK', + num_blocks=(2, 2), + num_channels=(78, 156), + num_heads=[2, 4], + mlp_ratios=[4, 4], + window_sizes=[7, 7]), + stage3=dict( + num_modules=4, + num_branches=3, + block='HRFORMERBLOCK', + num_blocks=(2, 2, 2), + num_channels=(78, 156, 312), + num_heads=[2, 4, 8], + mlp_ratios=[4, 4, 4], + window_sizes=[7, 7, 7]), + stage4=dict( + num_modules=2, + num_branches=4, + block='HRFORMERBLOCK', + num_blocks=(2, 2, 2, 2), + num_channels=(78, 156, 312, 624), + num_heads=[2, 4, 8, 16], + mlp_ratios=[4, 4, 4, 4], + window_sizes=[7, 7, 7, 7])), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 
'pretrain_models/hrformer_base-32815020_20220226.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=78, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') 
+test_evaluator = val_evaluator + +# fp16 settings +fp16 = dict(loss_scale='dynamic') diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..e283ae3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py @@ -0,0 +1,174 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=5e-4, + betas=(0.9, 0.999), + weight_decay=0.01, + ), + paramwise_cfg=dict( + custom_keys={'relative_position_bias_table': dict(decay_mult=0.)})) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRFormer', + in_channels=3, + norm_cfg=norm_cfg, + extra=dict( + drop_path_rate=0.1, + with_rpe=True, + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(2, ), + num_channels=(64, ), + num_heads=[2], + num_mlp_ratios=[4]), + stage2=dict( + num_modules=1, + num_branches=2, 
+ block='HRFORMERBLOCK', + num_blocks=(2, 2), + num_channels=(32, 64), + num_heads=[1, 2], + mlp_ratios=[4, 4], + window_sizes=[7, 7]), + stage3=dict( + num_modules=4, + num_branches=3, + block='HRFORMERBLOCK', + num_blocks=(2, 2, 2), + num_channels=(32, 64, 128), + num_heads=[1, 2, 4], + mlp_ratios=[4, 4, 4], + window_sizes=[7, 7, 7]), + stage4=dict( + num_modules=2, + num_branches=4, + block='HRFORMERBLOCK', + num_blocks=(2, 2, 2, 2), + num_channels=(32, 64, 128, 256), + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + window_sizes=[7, 7, 7, 7])), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrformer_small-09516375_20220226.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) 
+val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator + +# fp16 settings +fp16 = dict(loss_scale='dynamic') diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..323a168 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py @@ -0,0 +1,174 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=5e-4, + betas=(0.9, 0.999), + weight_decay=0.01, + ), + paramwise_cfg=dict( + custom_keys={'relative_position_bias_table': dict(decay_mult=0.)})) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + 
type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRFormer', + in_channels=3, + norm_cfg=norm_cfg, + extra=dict( + drop_path_rate=0.1, + with_rpe=True, + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(2, ), + num_channels=(64, ), + num_heads=[2], + num_mlp_ratios=[4]), + stage2=dict( + num_modules=1, + num_branches=2, + block='HRFORMERBLOCK', + num_blocks=(2, 2), + num_channels=(32, 64), + num_heads=[1, 2], + mlp_ratios=[4, 4], + window_sizes=[7, 7]), + stage3=dict( + num_modules=4, + num_branches=3, + block='HRFORMERBLOCK', + num_blocks=(2, 2, 2), + num_channels=(32, 64, 128), + num_heads=[1, 2, 4], + mlp_ratios=[4, 4, 4], + window_sizes=[7, 7, 7]), + stage4=dict( + num_modules=2, + num_branches=4, + block='HRFORMERBLOCK', + num_blocks=(2, 2, 2, 2), + num_channels=(32, 64, 128, 256), + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + window_sizes=[7, 7, 7, 7])), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrformer_small-09516375_20220226.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator + +# fp16 settings +fp16 = dict(loss_scale='dynamic') diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..7d89ef5 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + 
lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..fdb2bae --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = 
dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', 
direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py new file mode 100644 index 0000000..8499f40 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py @@ -0,0 +1,221 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = 
dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=3)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings +keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + 
checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=19, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_coco) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root='data/aic/', + data_mode=data_mode, + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + 
metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py new file mode 100644 index 0000000..5ac097c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py @@ -0,0 +1,187 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', 
input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root='data/aic/', + data_mode=data_mode, + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..22a9627 --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py @@ -0,0 +1,165 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/' + 'body_2d_keypoint/topdown_heatmap/coco/' + 'td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + 
loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict( + type='CoarseDropout', + max_holes=8, + max_height=40, + max_width=40, + min_holes=1, + min_height=10, + min_width=10, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..a11db9e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), 
+ num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators 
+val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..b2bb6e4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(288, 384), + heatmap_size=(72, 96), + sigma=3, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + 
num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + 
pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..50188c5 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py @@ -0,0 +1,7 @@ +_base_ = ['./td-hm_hrnet-w32_8xb64-210e_coco-256x192.py'] + +# fp16 settings +optim_wrapper = dict( + type='AmpOptimWrapper', + loss_scale='dynamic', +) diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..5157242 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py @@ -0,0 +1,162 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# 
model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/' + 'body_2d_keypoint/topdown_heatmap/coco/' + 'td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict( + type='GridDropout', + unit_size_min=10, + unit_size_max=40, + random_offset=True, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..ea82efc --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py @@ -0,0 +1,153 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + 
by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/' + 'body_2d_keypoint/topdown_heatmap/coco/' + 'td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + 
dict(type='PhotometricDistortion'), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..54ae3d9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy 
+param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + 
dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..3344529 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer 
+optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + 
dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..f29c8a2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py @@ -0,0 +1,155 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='UDPHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2, + heatmap_type='combined') + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=3 * 17, + deconv_out_channels=None, + loss=dict(type='CombinedTargetMSELoss', use_target_weight=True), + decoder=codec), + train_cfg=dict(compute_acc=False), + test_cfg=dict( + flip_test=True, + flip_mode='udp_combined', + shift_heatmap=False, + )) + +# base 
dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..5ddd421 --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + 
flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py new file mode 100644 index 
0000000..755f236 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + 
test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..80ffe8a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + 
type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..04cd41c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(288, 384), + heatmap_size=(72, 96), + sigma=3, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + 
checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..6cd31a7 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 
192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators 
+val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..f9edf38 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + 
block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + 
pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..f801f7a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py @@ -0,0 +1,140 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='LiteHRNet', + in_channels=3, + extra=dict( + stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + num_stages=3, + stages_spec=dict( + num_modules=(2, 4, 2), + num_branches=(2, 3, 4), + num_blocks=(2, 2, 2), + module_type=('LITE', 'LITE', 'LITE'), + with_fuse=(True, True, True), + reduce_ratios=(8, 8, 8), + num_channels=( + (40, 80), + (40, 80, 160), + (40, 
80, 160, 320), + )), + with_head=True, + )), + head=dict( + type='HeatmapHead', + in_channels=40, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..dd59f59 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py @@ -0,0 +1,140 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='LiteHRNet', + in_channels=3, + extra=dict( + stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + num_stages=3, + stages_spec=dict( + num_modules=(2, 4, 2), + num_branches=(2, 3, 4), + num_blocks=(2, 2, 2), + module_type=('LITE', 'LITE', 'LITE'), + with_fuse=(True, True, True), + reduce_ratios=(8, 8, 8), + num_channels=( + (40, 80), + (40, 80, 160), + (40, 80, 160, 320), + )), + with_head=True, + )), + head=dict( + type='HeatmapHead', + in_channels=40, + out_channels=17, + deconv_out_channels=None, 
+ loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..8b69bbc --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py @@ -0,0 +1,140 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='LiteHRNet', + in_channels=3, + extra=dict( + stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + num_stages=3, + stages_spec=dict( + num_modules=(3, 8, 3), + num_branches=(2, 3, 4), + num_blocks=(2, 2, 2), + module_type=('LITE', 'LITE', 'LITE'), + with_fuse=(True, True, True), + reduce_ratios=(8, 8, 8), + num_channels=( + (40, 80), + (40, 80, 160), + (40, 80, 160, 320), + )), + with_head=True, + )), + head=dict( + type='HeatmapHead', + in_channels=40, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + 
test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..2aa7f3c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py @@ -0,0 +1,140 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='LiteHRNet', + in_channels=3, + extra=dict( + stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + num_stages=3, + stages_spec=dict( + num_modules=(3, 8, 3), + num_branches=(2, 3, 4), + num_blocks=(2, 2, 2), + module_type=('LITE', 'LITE', 'LITE'), + with_fuse=(True, True, True), + reduce_ratios=(8, 8, 8), + num_channels=( + (40, 80), + (40, 80, 160), + (40, 80, 160, 320), + )), + with_head=True, + )), + head=dict( + type='HeatmapHead', + in_channels=40, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings 
+dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..1601819 --- 
/dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MobileNetV2', + widen_factor=1., + out_indices=(7, ), + init_cfg=dict( + type='Pretrained', + checkpoint='mmcls://mobilenet_v2', + )), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..fcf4ee7 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + 
begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MobileNetV2', + widen_factor=1., + out_indices=(7, ), + init_cfg=dict( + type='Pretrained', + checkpoint='mmcls://mobilenet_v2', + )), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) 
+val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..b1dbb18 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py @@ -0,0 +1,152 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +# multiple kernel_sizes of heatmap gaussian for 'Megvii' approach. 
+kernel_sizes = [11, 9, 7, 5] +codec = [ + dict( + type='MegviiHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + kernel_size=kernel_size) for kernel_size in kernel_sizes +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MSPN', + unit_channels=256, + num_stages=1, + num_units=4, + num_blocks=[3, 4, 6, 3], + norm_cfg=dict(type='BN'), + init_cfg=dict( + type='Pretrained', + checkpoint='torchvision://resnet50', + )), + head=dict( + type='MSPNHead', + out_shape=(64, 48), + unit_channels=256, + out_channels=17, + num_stages=1, + num_units=4, + norm_cfg=dict(type='BN'), + # each sub list is for a stage + # and each element in each list is for a unit + level_indices=[0, 1, 2, 3], + loss=[ + dict( + type='KeypointMSELoss', + use_target_weight=True, + loss_weight=0.25) + ] * 3 + [ + dict( + type='KeypointOHKMMSELoss', + use_target_weight=True, + loss_weight=1.) 
+ ], + decoder=codec[-1]), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='GenerateTarget', multilevel=True, encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..4a8704a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py @@ -0,0 +1,127 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='PyramidVisionTransformer', + num_layers=[3, 4, 6, 3], + init_cfg=dict( + type='Pretrained', + checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_small.pth'), + ), + neck=dict(type='FeatureMapProcessor', select_index=3), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..dd7b5d9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py @@ -0,0 +1,128 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime 
+train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='PyramidVisionTransformerV2', + embed_dims=64, + num_layers=[3, 4, 6, 3], + init_cfg=dict( + type='Pretrained', + checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b2.pth'), + ), + neck=dict(type='FeatureMapProcessor', select_index=3), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..25ebf01 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], 
+ gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=2, + 
persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..29f6555 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + 
backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..3ef9880 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..c8ce6c1 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py @@ -0,0 +1,125 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(288, 384), + heatmap_size=(72, 96), + sigma=3, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders 
+train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..b2307b2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) 
+ +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..eae41ac --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + 
type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..4d55253 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', 
direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..524d999 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py @@ -0,0 +1,126 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + 
+# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(288, 384), + heatmap_size=(72, 96), + sigma=3, + unbiased=True, + blur_kernel_size=17) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..e00887f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# 
codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + 
bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..91e8ef2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + 
decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..07b6031 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..7678941 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + 
dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(288, 384), + heatmap_size=(72, 96), + sigma=3, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..57c8374 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py @@ -0,0 +1,7 @@ +_base_ = ['./td-hm_res50_8xb64-210e_coco-256x192.py'] + +# fp16 settings +optim_wrapper = dict( + type='AmpOptimWrapper', + loss_scale='dynamic', +) diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..05be08c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# 
learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..fb08555 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model 
settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + 
data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py new file mode 100644 index 0000000..48e6992 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=128) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + depth=200, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest200'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + 
shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..85466b7 --- 
/dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + depth=200, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest200'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py new file mode 100644 index 0000000..5279a43 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + 
milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=128) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + depth=269, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest269'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=16, + num_workers=2, + 
persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..84eb68b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + 
bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + depth=269, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest269'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..a9fb575 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..b6e3dcb --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime 
+train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..5d90ab7 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', 
rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNetV1d', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet101_v1d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..f2a66df --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNetV1d', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet101_v1d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + 
out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..4821267 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNetV1d', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet152_v1d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py new file mode 100644 index 0000000..27281df --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler 
= [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=384) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNetV1d', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet152_v1d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=48, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..e418369 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model 
settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNetV1d', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet50_v1d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + 
data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..59d3ba6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNetV1d', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet50_v1d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + 
shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..eedb64c --- 
/dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeXt', + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='mmcls://resnext101_32x4d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..42487c0 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + 
milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeXt', + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='mmcls://resnext101_32x4d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + 
num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..82cfeae --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 
57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeXt', + depth=152, + init_cfg=dict( + type='Pretrained', checkpoint='mmcls://resnext152_32x4d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + 
ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py new file mode 100644 index 0000000..2503796 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=384) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeXt', + depth=152, + init_cfg=dict( + type='Pretrained', checkpoint='mmcls://resnext152_32x4d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=48, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..2513248 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeXt', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnext50_32x4d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + 
batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..756010c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks 
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeXt', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnext50_32x4d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..7641846 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-2, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 190, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +# multiple kernel_sizes of heatmap gaussian for 'Megvii' approach. 
+kernel_sizes = [11, 9, 7, 5] +codec = [ + dict( + type='MegviiHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + kernel_size=kernel_size) for kernel_size in kernel_sizes +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='RSN', + unit_channels=256, + num_stages=1, + num_units=4, + num_blocks=[2, 2, 2, 2], + num_steps=4, + norm_cfg=dict(type='BN'), + ), + head=dict( + type='MSPNHead', + out_shape=(64, 48), + unit_channels=256, + out_channels=17, + num_stages=1, + num_units=4, + norm_cfg=dict(type='BN'), + # each sub list is for a stage + # and each element in each list is for a unit + level_indices=[0, 1, 2, 3], + loss=[ + dict( + type='KeypointMSELoss', + use_target_weight=True, + loss_weight=0.25) + ] * 3 + [ + dict( + type='KeypointOHKMMSELoss', + use_target_weight=True, + loss_weight=1.) + ], + decoder=codec[-1]), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='GenerateTarget', multilevel=True, encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none') +test_evaluator = val_evaluator + +# fp16 settings +fp16 = dict(loss_scale='dynamic') diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..b144cf6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +# multiple kernel_sizes 
of heatmap gaussian for 'Megvii' approach. +kernel_sizes = [11, 9, 7, 5] +codec = [ + dict( + type='MegviiHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + kernel_size=kernel_size) for kernel_size in kernel_sizes +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='RSN', + unit_channels=256, + num_stages=1, + num_units=4, + num_blocks=[3, 4, 6, 3], + num_steps=4, + norm_cfg=dict(type='BN'), + ), + head=dict( + type='MSPNHead', + out_shape=(64, 48), + unit_channels=256, + out_channels=17, + num_stages=1, + num_units=4, + norm_cfg=dict(type='BN'), + # each sub list is for a stage + # and each element in each list is for a unit + level_indices=[0, 1, 2, 3], + loss=[ + dict( + type='KeypointMSELoss', + use_target_weight=True, + loss_weight=0.25) + ] * 3 + [ + dict( + type='KeypointOHKMMSELoss', + use_target_weight=True, + loss_weight=1.) 
+ ], + decoder=codec[-1]), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='GenerateTarget', multilevel=True, encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec[0]['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none') +test_evaluator = val_evaluator + +# fp16 settings +fp16 = dict(loss_scale='dynamic') diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..d8e49fe --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SCNet', + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/scnet101-94250a77.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + 
dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=1, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py new file mode 100644 index 0000000..3281e4a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, 
val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SCNet', + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/scnet101-94250a77.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=48, + 
num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..41071b6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = 
dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SCNet', + depth=50, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/scnet50-7ef0a199.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=1, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + 
type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..7355333 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SCNet', + depth=50, + init_cfg=dict( + type='Pretrained', + 
checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/scnet50-7ef0a199.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') 
+test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..f1bb265 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SEResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + 
dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..d679fc9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = 
dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SEResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..721d4b8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', 
rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SEResNet', + depth=152, + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 
'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py new file mode 100644 index 0000000..94ee1e9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=384) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SEResNet', + depth=152, + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + 
+# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=48, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..6ac46fd --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SEResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..8860772 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + 
by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SEResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + 
drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..ec7d34b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + 
backbone=dict( + type='ShuffleNetV1', + groups=3, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v1'), + ), + head=dict( + type='HeatmapHead', + in_channels=960, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..cff10f4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ShuffleNetV1', + groups=3, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v1'), + ), + head=dict( + type='HeatmapHead', + in_channels=960, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..59c8109 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# 
runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ShuffleNetV2', + widen_factor=1.0, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v2'), + ), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..d65aa54 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = 
dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ShuffleNetV2', + widen_factor=1.0, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v2'), + ), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..c29257b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py @@ -0,0 +1,139 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 
32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(3, ), + with_cp=False, + convert_weights=True, + init_cfg=dict( + type='Pretrained', + checkpoint='https://github.com/SwinTransformer/storage/releases/' + 'download/v1.0.0/swin_base_patch4_window7_224_22k.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 
'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..4bc632a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py @@ -0,0 +1,139 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, 
+ patch_norm=True, + out_indices=(3, ), + with_cp=False, + convert_weights=True, + init_cfg=dict( + type='Pretrained', + checkpoint='https://github.com/SwinTransformer/storage/releases/' + 'download/v1.0.0/swin_base_patch4_window12_384_22k.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) 
+test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..3294263 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py @@ -0,0 +1,148 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=5e-4, + betas=(0.9, 0.999), + weight_decay=0.01, + ), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
+ })) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SwinTransformer', + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.5, + patch_norm=True, + out_indices=(3, ), + with_cp=False, + convert_weights=True, + init_cfg=dict( + type='Pretrained', + checkpoint='https://github.com/SwinTransformer/storage/releases/' + 'download/v1.0.0/swin_base_patch4_window7_224_22k.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1536, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + 
dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py new file mode 100644 index 0000000..643cbc2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py @@ -0,0 +1,148 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=5e-4, + betas=(0.9, 0.999), + weight_decay=0.01, + ), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': 
dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SwinTransformer', + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.5, + patch_norm=True, + out_indices=(3, ), + with_cp=False, + convert_weights=True, + init_cfg=dict( + type='Pretrained', + checkpoint='https://github.com/SwinTransformer/storage/releases/' + 'download/v1.0.0/swin_base_patch4_window12_384_22k.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1536, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py new file mode 100644 index 0000000..9c4ab23 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py @@ -0,0 +1,139 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# 
learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(3, ), + with_cp=False, + convert_weights=True, + init_cfg=dict( + type='Pretrained', + checkpoint='https://github.com/SwinTransformer/storage/releases/' + 'download/v1.0.0/swin_tiny_patch4_window7_224.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + 
+val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..f50c2b4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + 
type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='VGG', + depth=16, + norm_cfg=dict(type='BN'), + init_cfg=dict(type='Pretrained', checkpoint='mmcls://vgg16_bn'), + ), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, 
+ )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..7be5676 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + 
mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ViPNAS_MobileNetV3'), + head=dict( + type='ViPNASHead', + in_channels=160, + out_channels=17, + deconv_out_channels=(160, 160, 160), + deconv_num_groups=(160, 160, 160), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) 
+test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..9477532 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ViPNAS_ResNet', depth=50), + head=dict( + type='ViPNASHead', + in_channels=608, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline 
= [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.md new file mode 100644 index 0000000..a03c8fc --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.md @@ -0,0 +1,39 @@ + + +
+VGG (ICLR'2015) + +```bibtex +@article{simonyan2014very, + title={Very deep convolutional networks for large-scale image recognition}, + author={Simonyan, Karen and Zisserman, Andrew}, + journal={arXiv preprint arXiv:1409.1556}, + year={2014} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [vgg](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py) | 256x192 | 0.699 | 0.890 | 0.769 | 0.754 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vgg/vgg16_bn_coco_256x192-7e7c58d6_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vgg/vgg16_bn_coco_256x192_20210517.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.yml new file mode 100644 index 0000000..6de1830 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.yml @@ -0,0 +1,19 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - VGG + Training Data: COCO + Name: td-hm_vgg16-bn_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.699 + AP@0.5: 0.89 + AP@0.75: 0.769 + AR: 0.754 + AR@0.5: 0.927 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/vgg/vgg16_bn_coco_256x192-7e7c58d6_20210517.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md new file mode 100644 index 0000000..e138d21 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md @@ -0,0 +1,40 @@ + + +
+ViPNAS (CVPR'2021) + +```bibtex +@article{xu2021vipnas, + title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search}, + author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + year={2021} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [S-ViPNAS-MobileNetV3](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py) | 256x192 | 0.700 | 0.887 | 0.783 | 0.758 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192-e0987441_20221010.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192_20221010.log) | +| [S-ViPNAS-Res50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.711 | 0.894 | 0.787 | 0.769 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192-35d4bff9_20220917.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192_20220917.log) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.yml new file mode 100644 index 0000000..66f1819 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.yml @@ -0,0 +1,40 @@ +Collections: +- Name: ViPNAS + Paper: + Title: 'ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search' + URL: https://arxiv.org/abs/2105.10154 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/vipnas.md +Models: +- Config: 
configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py + In Collection: ViPNAS + Metadata: + Architecture: &id001 + - ViPNAS + Training Data: COCO + Name: td-hm_vipnas-mbv3_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.7 + AP@0.5: 0.887 + AP@0.75: 0.783 + AR: 0.758 + AR@0.5: 0.929 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192-e0987441_20221010.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py + In Collection: ViPNAS + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_vipnas-res50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.711 + AP@0.5: 0.894 + AP@0.75: 0.787 + AR: 0.769 + AR@0.5: 0.934 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192-35d4bff9_20220917.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md new file mode 100644 index 0000000..b29fd86 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md @@ -0,0 +1,61 @@ +To utilize ViTPose, you'll need to have [MMPreTrain](https://github.com/open-mmlab/mmpretrain). To install the required version, run the following command: + +```shell +mim install 'mmpretrain>=1.0.0' +``` + + + +
+ViTPose (NeurIPS'2022) + +```bibtex +@inproceedings{ + xu2022vitpose, + title={Vi{TP}ose: Simple Vision Transformer Baselines for Human Pose Estimation}, + author={Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao}, + booktitle={Advances in Neural Information Processing Systems}, + year={2022}, +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +> With classic decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.739 | 0.903 | 0.816 | 0.792 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.json) | +| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.905 | 0.829 | 0.810 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.json) | +| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) | 256x192 | 0.782 | 0.914 | 0.850 | 0.834 | 0.952 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.json) | +| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.788 | 0.917 | 0.855 | 0.839 | 0.954 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.json) | +| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.790 | 0.916 | 0.857 | 0.840 | 0.953 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_3rdparty_coco-256x192-5b738c8e_20230314.pth) | - | + +*Models with * are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose). The config files of these models are only for validation.* + +> With simple decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.736 | 0.900 | 0.811 | 0.790 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192-4c101a76_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192-4c101a76_20230314.json) | +| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.756 | 0.906 | 0.826 | 0.809 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192-0b8234ea_20230407.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192-0b8234ea_20230407.json) | +| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.780 | 0.914 | 0.851 | 0.833 | 0.952 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192-3a7ee9e1_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192-3a7ee9e1_20230314.json) | +| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.789 | 0.916 | 0.856 | 0.839 | 0.953 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192-ffd48c05_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192-ffd48c05_20230314.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml new file mode 100644 index 0000000..fd70420 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml @@ -0,0 +1,155 @@ +Collections: +- Name: ViTPose + Paper: + Title: 'ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation' + URL: https://arxiv.org/abs/2204.12484 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/vitpose.md + Metadata: + Training Resources: 8x A100 GPUs +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Metadata: + Architecture: &id001 + - ViTPose + - Classic Head + Model Size: 
Small + Training Data: COCO + Name: td-hm_ViTPose-small_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.739 + AP@0.5: 0.903 + AP@0.75: 0.816 + AR: 0.792 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Metadata: + Architecture: *id001 + Model Size: Base + Training Data: COCO + Name: td-hm_ViTPose-base_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.757 + AP@0.5: 0.905 + AP@0.75: 0.829 + AR: 0.81 + AR@0.5: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Metadata: + Architecture: *id001 + Model Size: Large + Training Data: COCO + Name: td-hm_ViTPose-large_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.782 + AP@0.5: 0.914 + AP@0.75: 0.850 + AR: 0.834 + AR@0.5: 0.952 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Metadata: + Architecture: *id001 + Model Size: Huge + Training Data: COCO + Name: td-hm_ViTPose-huge_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.788 + AP@0.5: 0.917 + AP@0.75: 0.855 + AR: 0.839 + AR@0.5: 0.954 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Alias: vitpose-s + Metadata: + Architecture: &id002 + - ViTPose + - Simple Head + Model Size: Small + Training Data: COCO + Name: td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.736 + AP@0.5: 0.900 + AP@0.75: 0.811 + AR: 0.790 + AR@0.5: 0.940 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192-4c101a76_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Alias: + - vitpose + - vitpose-b + Metadata: + Architecture: *id002 + Model Size: Base + Training Data: COCO + Name: td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.756 + AP@0.5: 0.906 + AP@0.75: 0.826 + AR: 0.809 + AR@0.5: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192-0b8234ea_20230407.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Alias: vitpose-l + Metadata: + Architecture: *id002 + Model Size: Large + Training Data: COCO + Name: td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.780 + AP@0.5: 0.914 + AP@0.75: 0.851 + AR: 0.833 + AR@0.5: 0.952 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192-3a7ee9e1_20230314.pth +- Config: 
configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Alias: vitpose-h + Metadata: + Architecture: *id002 + Model Size: Huge + Training Data: COCO + Name: td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.789 + AP@0.5: 0.916 + AP@0.75: 0.856 + AR: 0.839 + AR@0.5: 0.953 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192-ffd48c05_20230314.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py new file mode 100644 index 0000000..1edee28 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py @@ -0,0 +1,216 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( 
+ type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=14, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='crowdpose/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, 
+ update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'crowdpose/annotations/mmpose_crowdpose_test.json', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.md new file mode 100644 index 0000000..734e210 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.md @@ -0,0 +1,56 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector + +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [pose_cspnext_m](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py) | 256x192 | 0.662 | 0.821 | 0.723 | 0.759 | 0.675 | 0.539 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-crowdpose_pt-in1k_210e-256x192-f591079f_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-crowdpose_pt-in1k_210e-256x192-f591079f_20230123.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.yml new file mode 100644 index 0000000..6201813 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.yml @@ -0,0 +1,20 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py + In Collection: UDP + Metadata: + Architecture: + - UDP + - CSPNeXt + Training Data: CrowdPose + Name: cspnext-m_udp_8xb64-210e_crowpose-256x192 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.662 + AP (E): 0.759 + AP (H): 0.539 + AP (M): 0.675 + AP@0.5: 0.821 + AP@0.75: 0.723 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-crowdpose_pt-in1k_210e-256x192-f591079f_20230123.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md new file mode 100644 index 0000000..5fdb1aa --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md @@ -0,0 +1,38 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector + +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.675 | 0.825 | 0.729 | 0.770 | 0.687 | 0.553 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192-960be101_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192_20201227.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.yml new file mode 100644 index 0000000..f090812 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.yml @@ -0,0 +1,19 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py + In Collection: HRNet + Metadata: + Architecture: + - HRNet + Training Data: CrowdPose + Name: td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.675 + AP (E): 0.77 + AP (H): 0.553 + AP (M): 0.687 + AP@0.5: 0.825 + AP@0.75: 0.729 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192-960be101_20201227.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md new file mode 100644 index 0000000..d987f26 --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md @@ -0,0 +1,58 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector + +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.637 | 0.808 | 0.692 | 0.738 | 0.650 | 0.506 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192-c6a526b6_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192_20201227.log.json) | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.647 | 0.810 | 0.703 | 0.745 | 0.658 | 0.521 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192-8f5870f4_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192_20201227.log.json) | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py) | 320x256 | 0.661 | 0.821 | 0.714 | 0.759 | 0.672 | 0.534 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256-c88c512a_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256_20201227.log.json) | +| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.656 | 0.818 | 0.712 | 0.754 | 0.666 | 0.533 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192-dbd49aba_20201227.pth) | 
[log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192_20201227.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.yml new file mode 100644 index 0000000..15802eb --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.yml @@ -0,0 +1,71 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: CrowdPose + Name: td-hm_res50_8xb64-210e_crowdpose-256x192 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.637 + AP (E): 0.738 + AP (H): 0.506 + AP (M): 0.65 + AP@0.5: 0.808 + AP@0.75: 0.692 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192-c6a526b6_20201227.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: td-hm_res101_8xb64-210e_crowdpose-256x192 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.647 + AP (E): 0.745 + AP (H): 0.521 + AP (M): 0.658 + AP@0.5: 0.81 + AP@0.75: 0.703 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192-8f5870f4_20201227.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: td-hm_res101_8xb64-210e_crowdpose-320x256 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.661 + AP (E): 0.759 + AP (H): 0.534 + AP (M): 0.672 + AP@0.5: 0.821 + AP@0.75: 0.714 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256-c88c512a_20201227.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: td-hm_res152_8xb64-210e_crowdpose-256x192 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.656 + AP (E): 0.754 + AP (H): 0.533 + AP (M): 0.666 + AP@0.5: 0.818 + AP@0.75: 0.712 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192-dbd49aba_20201227.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py new file mode 100644 index 0000000..c5ec67a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py @@ -0,0 +1,152 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + 
std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=14, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'topdown' +data_root = 'data/crowdpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='images/'), + 
pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_test.json', + bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/mmpose_crowdpose_test.json', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py new file mode 100644 index 0000000..ef78bbc --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py @@ -0,0 +1,123 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + 
type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=14, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'topdown' +data_root = 'data/crowdpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_test.json', + bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json', + data_prefix=dict(img='images/'), + test_mode=True, + 
pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/mmpose_crowdpose_test.json', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py new file mode 100644 index 0000000..4ffb602 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py @@ -0,0 +1,123 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 320), heatmap_size=(64, 80), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=14, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + 
flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'topdown' +data_root = 'data/crowdpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_test.json', + bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/mmpose_crowdpose_test.json', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py new file mode 100644 index 0000000..d53e2d1 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py @@ -0,0 +1,123 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=14, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'topdown' +data_root = 'data/crowdpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_test.json', + bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/mmpose_crowdpose_test.json', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py new file mode 100644 index 0000000..2ae99ce --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py @@ -0,0 +1,123 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + 
lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=14, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'topdown' +data_root = 'data/crowdpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_test.json', + bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/mmpose_crowdpose_test.json', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.md new file mode 100644 index 0000000..524164e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.md @@ -0,0 +1,38 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+ExLPose (CVPR'2023) + +```bibtex +@inproceedings{ExLPose_2023_CVPR, + title={Human Pose Estimation in Extremely Low-Light Conditions}, + author={Sohyun Lee and Jaesung Rim and Boseung Jeong and Geonu Kim and ByungJu Woo and Haechan Lee and Sunghyun Cho and Suha Kwak}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2023} +} +``` + +
+ +Results on ExLPose-LLA val set with ground-truth bounding boxes + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py) | 256x192 | 0.401 | 0.64 | 0.40 | 0.452 | 0.693 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-ll-256x192.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-ll-256x192.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.yml new file mode 100644 index 0000000..b24cbdd --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.yml @@ -0,0 +1,18 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py + In Collection: HRNet + Metadata: + Architecture: + - HRNet + Training Data: ExLPose-LL + Name: td-hm_hrnet-w32_8xb64-210e_exlpose-256x192 + Results: + - Dataset: ExLPose + Metrics: + AP: 0.401 + AP@0.5: 0.64 + AP@0.75: 0.40 + AR: 0.452 + AR@0.5: 0.693 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-ll-256x192.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py new file mode 100644 index 0000000..95cfc6f --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py @@ -0,0 +1,149 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=14, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + 
flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'ExlposeDataset' +data_mode = 'topdown' +data_root = 'data/ExLPose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ExLPose/ExLPose_train_LL.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ExLPose/ExLPose_test_LL-A.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ExLPose/ExLPose_test_LL-A.json', + use_area=False) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.md new file mode 100644 index 0000000..71b825e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.md @@ -0,0 +1,80 @@ + 
+ +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +
+Human-Art (CVPR'2023) + +```bibtex +@inproceedings{ju2023humanart, + title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes}, + author={Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2023} +} +``` + +
+ +Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset + +> With classic decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | 256x192 | 0.252 | 0.397 | 0.255 | 0.321 | 0.485 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) | +| [pose_hrnet_w32-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py) | 256x192 | 0.399 | 0.545 | 0.420 | 0.466 | 0.613 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.json) | +| [pose_hrnet_w48-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py) | 256x192 | 0.271 | 0.413 | 0.277 | 0.339 | 0.499 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192_20220913.log) | +| [pose_hrnet_w48-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py) | 
256x192 | 0.417 | 0.553 | 0.442 | 0.481 | 0.617 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.json) | + +Results on Human-Art validation dataset with ground-truth bounding-box + +> With classic decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | 256x192 | 0.533 | 0.771 | 0.562 | 0.574 | 0.792 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) | +| [pose_hrnet_w32-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py) | 256x192 | 0.754 | 0.906 | 0.812 | 0.783 | 0.916 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.json) | +| [pose_hrnet_w48-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py) | 256x192 | 0.557 | 0.782 | 0.593 | 0.595 | 0.804 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192_20220913.log) | +| [pose_hrnet_w48-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py) | 256x192 | 0.769 | 0.906 | 0.825 | 0.796 | 0.919 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.json) | + +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +> With classic decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | 256x192 | 0.749 | 0.906 | 0.821 | 0.804 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) | +| [pose_hrnet_w32-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py) | 256x192 | 0.741 | 0.902 | 0.814 | 0.795 | 0.941 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.json) | +| [pose_hrnet_w48-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py) | 256x192 | 0.756 | 0.908 | 0.826 | 0.809 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192_20220913.log) | +| [pose_hrnet_w48-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py) | 256x192 | 0.751 | 0.905 | 0.822 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.yml new file mode 100644 index 0000000..d49a662 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.yml @@ -0,0 +1,74 @@ +Collections: +- Name: HRNet + Paper: + Title: Deep high-resolution representation learning for human pose estimation + URL: http://openaccess.thecvf.com/content_CVPR_2019/html/Sun_Deep_High-Resolution_Representation_Learning_for_Human_Pose_Estimation_CVPR_2019_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrnet.md 
+Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py + In Collection: HRNet + Metadata: + Architecture: &id001 + - HRNet + Training Data: &id002 + - COCO + - Human-Art + Name: td-hm_hrnet-w32_8xb64-210e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.741 + AP@0.5: 0.902 + AP@0.75: 0.814 + AR: 0.795 + AR@0.5: 0.941 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.399 + AP@0.5: 0.545 + AP@0.75: 0.420 + AR: 0.466 + AR@0.5: 0.613 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.754 + AP@0.5: 0.906 + AP@0.75: 0.812 + AR: 0.783 + AR@0.5: 0.916 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: td-hm_hrnet-w48_8xb32-210e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.751 + AP@0.5: 0.905 + AP@0.75: 0.822 + AR: 0.805 + AR@0.5: 0.943 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.417 + AP@0.5: 0.553 + AP@0.75: 0.442 + AR: 0.481 + AR@0.5: 0.617 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.769 + AP@0.5: 0.906 + AP@0.75: 0.825 + AR: 0.796 + AR@0.5: 0.919 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py new file mode 100644 index 0000000..c28de59 --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch='base', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.3, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_base.pth'), + 
), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/' +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/validation_humanart.json', + bbox_file=f'{data_root}HumanArt/person_detection_results/' + 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py new file mode 100644 index 0000000..92a51d1 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=32, + layer_decay_rate=0.85, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + 
bgr_to_rgb=True), + backbone=dict( + type='mmcls.VisionTransformer', + arch='huge', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.55, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_huge.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/' +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='HumanArt/annotations/validation_humanart.json', + bbox_file=f'{data_root}HumanArt/person_detection_results/' + 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py new file mode 100644 index 0000000..ec7edd2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=24, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + 
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmcls.VisionTransformer', + arch='large', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.5, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_large.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/' +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/validation_humanart.json', + bbox_file=f'{data_root}HumanArt/person_detection_results/' + 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py new file mode 100644 index 0000000..ce27e97 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py @@ -0,0 +1,155 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning 
policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch={ + 'embed_dims': 384, + 'num_layers': 12, + 'num_heads': 12, + 'feedforward_channels': 384 * 4 + }, + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.1, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_small.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/' +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + 
dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/validation_humanart.json', + bbox_file=f'{data_root}HumanArt/person_detection_results/' + 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py new file mode 100644 index 0000000..00bfd37 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', 
begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' +data_root = 'data/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/validation_humanart.json', + bbox_file=f'{data_root}HumanArt/person_detection_results/' + 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py new file mode 100644 index 0000000..21269e4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# 
learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'HumanArtDataset' +data_mode = 'topdown' +data_root = 'data/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + 
dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart_coco.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/validation_humanart.json', + bbox_file=f'{data_root}HumanArt/person_detection_results/' + 'HumanArt_validation_detections_AP_H_56_person.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'HumanArt/annotations/validation_humanart.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.md new file mode 100644 index 0000000..dc0e52f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.md @@ -0,0 +1,97 @@ +To utilize ViTPose, you'll need to have [MMPreTrain](https://github.com/open-mmlab/mmpretrain). To install the required version, run the following command: + +```shell +mim install 'mmpretrain>=1.0.0' +``` + + + +
+ +ViTPose (NeurIPS'2022) + +```bibtex +@inproceedings{ + xu2022vitpose, + title={Vi{TP}ose: Simple Vision Transformer Baselines for Human Pose Estimation}, + author={Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao}, + booktitle={Advances in Neural Information Processing Systems}, + year={2022}, +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +
+
Human-Art (CVPR'2023) + +```bibtex +@inproceedings{ju2023humanart, + title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes}, + author={Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2023}} +``` + +
+ +Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset + +> With classic decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [ViTPose-S-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.228 | 0.371 | 0.229 | 0.298 | 0.467 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.json) | +| [ViTPose-S-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py) | 256x192 | 0.381 | 0.532 | 0.405 | 0.448 | 0.602 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.json) | +| [ViTPose-B-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.270 | 0.423 | 0.272 | 0.340 | 0.510 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.json) | +| 
[ViTPose-B-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py) | 256x192 | 0.410 | 0.549 | 0.434 | 0.475 | 0.615 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.json) | +| [ViTPose-L-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.342 | 0.498 | 0.357 | 0.413 | 0.577 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.json) | +| [ViTPose-L-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py) | 256x192 | 0.459 | 0.592 | 0.487 | 0.525 | 0.656 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.json) | +| [ViTPose-H-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.377 | 0.541 | 0.391 | 0.447 | 0.615 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.json) | +| 
[ViTPose-H-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py) | 256x192 | 0.468 | 0.594 | 0.498 | 0.534 | 0.655 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.json) | + +Results on Human-Art validation dataset with ground-truth bounding-box + +> With classic decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [ViTPose-S-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.507 | 0.758 | 0.531 | 0.551 | 0.780 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.json) | +| [ViTPose-S-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py) | 256x192 | 0.738 | 0.905 | 0.802 | 0.768 | 0.911 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.json) | +| 
[ViTPose-B-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.555 | 0.782 | 0.590 | 0.599 | 0.809 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.json) | +| [ViTPose-B-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py) | 256x192 | 0.759 | 0.905 | 0.823 | 0.790 | 0.917 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.json) | +| [ViTPose-L-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.637 | 0.838 | 0.689 | 0.677 | 0.859 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.json) | +| [ViTPose-L-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py) | 256x192 | 0.789 | 0.916 | 0.845 | 0.819 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.json) | +| 
[ViTPose-H-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.665 | 0.860 | 0.715 | 0.701 | 0.871 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.json) | +| [ViTPose-H-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py) | 256x192 | 0.800 | 0.926 | 0.855 | 0.828 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.json) | + +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +> With classic decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [ViTPose-S-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.739 | 0.903 | 0.816 | 0.792 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.json) | +| 
[ViTPose-S-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py) | 256x192 | 0.737 | 0.902 | 0.811 | 0.792 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.json) | +| [ViTPose-B-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.905 | 0.829 | 0.810 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.json) | +| [ViTPose-B-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py) | 256x192 | 0.758 | 0.906 | 0.829 | 0.812 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.json) | +| [ViTPose-L-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) | 256x192 | 0.782 | 0.914 | 0.850 | 0.834 | 0.952 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.json) | +| 
[ViTPose-L-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py) | 256x192 | 0.782 | 0.914 | 0.849 | 0.835 | 0.953 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.json) | +| [ViTPose-H-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.788 | 0.917 | 0.855 | 0.839 | 0.954 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.json) | +| [ViTPose-H-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py) | 256x192 | 0.788 | 0.914 | 0.853 | 0.841 | 0.956 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.yml new file mode 100644 index 0000000..2d2ba30 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.yml @@ -0,0 +1,145 @@ +Collections: +- Name: ViTPose + Paper: + Title: 'ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation' + URL: 
https://arxiv.org/abs/2204.12484 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/vitpose.md + Metadata: + Training Resources: 8x A100 GPUs +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py + In Collection: ViTPose + Metadata: + Architecture: &id001 + - ViTPose + - Classic Head + Model Size: Small + Training Data: &id002 + - COCO + - Human-Art + Name: td-hm_ViTPose-small_8xb64-210e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.737 + AP@0.5: 0.902 + AP@0.75: 0.811 + AR: 0.792 + AR@0.5: 0.942 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.381 + AP@0.5: 0.532 + AP@0.75: 0.405 + AR: 0.448 + AR@0.5: 0.602 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.738 + AP@0.5: 0.905 + AP@0.75: 0.802 + AR: 0.768 + AR@0.5: 0.911 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py + In Collection: ViTPose + Metadata: + Architecture: *id001 + Model Size: Base + Training Data: *id002 + Name: td-hm_ViTPose-base_8xb64-210e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.758 + AP@0.5: 0.906 + AP@0.75: 0.829 + AR: 0.812 + AR@0.5: 0.946 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.410 + AP@0.5: 0.549 + AP@0.75: 0.434 + AR: 0.475 + AR@0.5: 0.615 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.759 + AP@0.5: 0.905 + AP@0.75: 0.823 + AR: 0.790 + AR@0.5: 0.917 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.pth +- Config: 
configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py + In Collection: ViTPose + Metadata: + Architecture: *id001 + Model Size: Large + Training Data: *id002 + Name: td-hm_ViTPose-large_8xb64-210e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.782 + AP@0.5: 0.914 + AP@0.75: 0.849 + AR: 0.835 + AR@0.5: 0.953 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.459 + AP@0.5: 0.592 + AP@0.75: 0.487 + AR: 0.525 + AR@0.5: 0.656 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.789 + AP@0.5: 0.916 + AP@0.75: 0.845 + AR: 0.819 + AR@0.5: 0.929 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py + In Collection: ViTPose + Metadata: + Architecture: *id001 + Model Size: Huge + Training Data: *id002 + Name: td-hm_ViTPose-huge_8xb64-210e_humanart-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.788 + AP@0.5: 0.914 + AP@0.75: 0.853 + AR: 0.841 + AR@0.5: 0.956 + Task: Body 2D Keypoint + - Dataset: Human-Art + Metrics: + AP: 0.468 + AP@0.5: 0.594 + AP@0.75: 0.498 + AR: 0.534 + AR@0.5: 0.655 + Task: Body 2D Keypoint + - Dataset: Human-Art(GT) + Metrics: + AP: 0.800 + AP@0.5: 0.926 + AP@0.75: 0.855 + AR: 0.828 + AR@0.5: 0.933 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.md new file mode 100644 index 0000000..bb19451 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.md @@ -0,0 +1,56 @@ + + +
+CPM (CVPR'2016) + +```bibtex +@inproceedings{wei2016convolutional, + title={Convolutional pose machines}, + author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser}, + booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition}, + pages={4724--4732}, + year={2016} +} +``` + +
+ + + +
+JHMDB (ICCV'2013) + +```bibtex +@inproceedings{Jhuang:ICCV:2013, + title = {Towards understanding action recognition}, + author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black}, + booktitle = {International Conf. on Computer Vision (ICCV)}, + month = Dec, + pages = {3192-3199}, + year = {2013} +} +``` + +
+ +Results on Sub-JHMDB dataset + +The models are pre-trained on MPII dataset only. NO test-time augmentation (multi-scale /rotation testing) is used. + +- Normalized by Person Size + +| Split | Arch | Input Size | Head | Sho | Elb | Wri | Hip | Knee | Ank | Mean | ckpt | log | +| :------ | :------------------------------------------------: | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-------------------------------------------------: | :------------------------------------------------: | +| Sub1 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py) | 368x368 | 96.1 | 91.9 | 81.0 | 78.9 | 96.6 | 90.8 | 87.3 | 89.5 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368-2d2585c9_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368_20201122.log.json) | +| Sub2 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py) | 368x368 | 98.1 | 93.6 | 77.1 | 70.9 | 94.0 | 89.1 | 84.7 | 87.4 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368-fc742f1f_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368_20201122.log.json) | +| Sub3 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py) | 368x368 | 97.9 | 94.9 | 87.3 | 84.0 | 98.6 | 94.4 | 86.2 | 92.4 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368-49337155_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368_20201122.log.json) | +| Average | cpm | 368x368 | 97.4 | 93.5 | 81.5 | 77.9 | 96.4 | 91.4 | 86.1 | 89.8 | - | - | + +- Normalized by Torso Size + +| Split | Arch | Input Size | Head | Sho | Elb | Wri | Hip | Knee | Ank | Mean | ckpt | log | +| :------ | :------------------------------------------------: | :--------: | :--: | 
:--: | :--: | :--: | :--: | :--: | :--: | :--: | :-------------------------------------------------: | :------------------------------------------------: | +| Sub1 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py) | 368x368 | 89.0 | 63.0 | 54.0 | 54.9 | 68.2 | 63.1 | 61.2 | 66.0 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368-2d2585c9_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368_20201122.log.json) | +| Sub2 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py) | 368x368 | 90.3 | 57.9 | 46.8 | 44.3 | 60.8 | 58.2 | 62.4 | 61.1 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368-fc742f1f_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368_20201122.log.json) | +| Sub3 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py) | 368x368 | 91.0 | 72.6 | 59.9 | 54.0 | 73.2 | 68.5 | 65.8 | 70.3 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368-49337155_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368_20201122.log.json) | +| Average | cpm | 368x368 | 90.1 | 64.5 | 53.6 | 51.1 | 67.4 | 63.3 | 63.1 | 65.7 | - | - | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.yml new file mode 100644 index 0000000..f923d5b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.yml @@ -0,0 +1,116 @@ +Models: +- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py + In Collection: CPM + Metadata: + Architecture: &id001 + - CPM + Training Data: JHMDB + Name: 
td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368 + Results: + - Dataset: JHMDB + Metrics: + Ank: 87.3 + Elb: 81 + Head: 96.1 + Hip: 96.6 + Knee: 90.8 + Mean: 89.5 + Sho: 91.9 + Wri: 78.9 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368-2d2585c9_20201122.pth +- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py + In Collection: CPM + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368 + Results: + - Dataset: JHMDB + Metrics: + Ank: 84.7 + Elb: 77.1 + Head: 98.1 + Hip: 94.0 + Knee: 89.1 + Mean: 87.4 + Sho: 93.6 + Wri: 70.9 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368-fc742f1f_20201122.pth +- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py + In Collection: CPM + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368 + Results: + - Dataset: JHMDB + Metrics: + Ank: 86.2 + Elb: 87.3 + Head: 97.9 + Hip: 98.6 + Knee: 94.4 + Mean: 92.4 + Sho: 94.9 + Wri: 84.0 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368-49337155_20201122.pth +- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py + In Collection: CPM + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368 + Results: + - Dataset: JHMDB + Metrics: + Ank: 61.2 + Elb: 54.0 + Head: 89.0 + Hip: 68.2 + Knee: 63.1 + Mean: 66.0 + Sho: 63.0 + Wri: 54.9 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368-2d2585c9_20201122.pth +- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py + In Collection: CPM + Metadata: + Architecture: *id001 + 
Training Data: JHMDB + Name: td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368 + Results: + - Dataset: JHMDB + Metrics: + Ank: 62.4 + Elb: 46.8 + Head: 90.3 + Hip: 60.8 + Knee: 58.2 + Mean: 61.1 + Sho: 57.9 + Wri: 44.3 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368-fc742f1f_20201122.pth +- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py + In Collection: CPM + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368 + Results: + - Dataset: JHMDB + Metrics: + Ank: 65.8 + Elb: 59.9 + Head: 91.0 + Hip: 73.2 + Knee: 68.5 + Mean: 70.3 + Sho: 72.6 + Wri: 54.0 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368-49337155_20201122.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.md new file mode 100644 index 0000000..d82672f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.md @@ -0,0 +1,81 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+JHMDB (ICCV'2013) + +```bibtex +@inproceedings{Jhuang:ICCV:2013, + title = {Towards understanding action recognition}, + author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black}, + booktitle = {International Conf. on Computer Vision (ICCV)}, + month = Dec, + pages = {3192-3199}, + year = {2013} +} +``` + +
+ +Results on Sub-JHMDB dataset + +The models are pre-trained on MPII dataset only. *NO* test-time augmentation (multi-scale /rotation testing) is used. + +- Normalized by Person Size + +| Split | Arch | Input Size | Head | Sho | Elb | Wri | Hip | Knee | Ank | Mean | ckpt | log | +| :------ | :------------------------------------------------: | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-------------------------------------------------: | :------------------------------------------------: | +| Sub1 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py) | 256x256 | 99.1 | 98.0 | 93.8 | 91.3 | 99.4 | 96.5 | 92.8 | 96.1 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256-932cb3b4_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256_20201122.log.json) | +| Sub2 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py) | 256x256 | 99.3 | 97.1 | 90.6 | 87.0 | 98.9 | 96.3 | 94.1 | 95.0 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256-83d606f7_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256_20201122.log.json) | +| Sub3 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py) | 256x256 | 99.0 | 97.9 | 94.0 | 91.6 | 99.7 | 98.0 | 94.7 | 96.7 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256-c4ec1a0b_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256_20201122.log.json) | +| Average | pose_resnet_50 | 256x256 | 99.2 | 97.7 | 92.8 | 90.0 | 99.3 | 96.9 | 93.9 | 96.0 | - | - | +| Sub1 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py) | 256x256 | 99.1 
| 98.5 | 94.6 | 92.0 | 99.4 | 94.6 | 92.5 | 96.1 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256-f0574a52_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256_20201122.log.json) | +| Sub2 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py) | 256x256 | 99.3 | 97.8 | 91.0 | 87.0 | 99.1 | 96.5 | 93.8 | 95.2 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256-f63af0ff_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256_20201122.log.json) | +| Sub3 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py) | 256x256 | 98.8 | 98.4 | 94.3 | 92.1 | 99.8 | 97.5 | 93.8 | 96.7 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256-c4bc2ddb_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256_20201122.log.json) | +| Average | pose_resnet_50 (2 Deconv.) 
| 256x256 | 99.1 | 98.2 | 93.3 | 90.4 | 99.4 | 96.2 | 93.4 | 96.0 | - | - | + +- Normalized by Torso Size + +| Split | Arch | Input Size | Head | Sho | Elb | Wri | Hip | Knee | Ank | Mean | ckpt | log | +| :------ | :------------------------------------------------: | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-------------------------------------------------: | :------------------------------------------------: | +| Sub1 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py) | 256x256 | 93.3 | 83.2 | 74.4 | 72.7 | 85.0 | 81.2 | 78.9 | 81.9 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256-932cb3b4_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256_20201122.log.json) | +| Sub2 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py) | 256x256 | 94.1 | 74.9 | 64.5 | 62.5 | 77.9 | 71.9 | 78.6 | 75.5 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256-83d606f7_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256_20201122.log.json) | +| Sub3 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py) | 256x256 | 97.0 | 82.2 | 74.9 | 70.7 | 84.7 | 83.7 | 84.2 | 82.9 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256-c4ec1a0b_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256_20201122.log.json) | +| Average | pose_resnet_50 | 256x256 | 94.8 | 80.1 | 71.3 | 68.6 | 82.5 | 78.9 | 80.6 | 80.1 | - | - | +| Sub1 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py) | 256x256 | 92.4 | 80.6 | 73.2 | 70.5 | 82.3 | 75.4 | 75.0 | 79.2 | 
[ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256-f0574a52_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256_20201122.log.json) | +| Sub2 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py) | 256x256 | 93.4 | 73.6 | 63.8 | 60.5 | 75.1 | 68.4 | 75.5 | 73.7 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256-f63af0ff_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256_20201122.log.json) | +| Sub3 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py) | 256x256 | 96.1 | 81.2 | 72.6 | 67.9 | 83.6 | 80.9 | 81.5 | 81.2 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256-c4bc2ddb_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256_20201122.log.json) | +| Average | pose_resnet_50 (2 Deconv.) 
| 256x256 | 94.0 | 78.5 | 69.9 | 66.3 | 80.3 | 74.9 | 77.3 | 78.0 | - | - | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.yml new file mode 100644 index 0000000..a4a9de3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.yml @@ -0,0 +1,231 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: JHMDB + Name: td-hm_res50_8xb64-20e_jhmdb-sub1-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 92.8 + Elb: 93.8 + Head: 99.1 + Hip: 99.4 + Knee: 96.5 + Mean: 96.1 + Sho: 98.0 + Wri: 91.3 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256-932cb3b4_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50_8xb64-20e_jhmdb-sub2-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 94.1 + Elb: 90.6 + Head: 99.3 + Hip: 98.9 + Knee: 96.3 + Mean: 95.0 + Sho: 97.1 + Wri: 87.0 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256-83d606f7_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50_8xb64-20e_jhmdb-sub3-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 94.7 + Elb: 94.0 + Head: 99.0 + Hip: 99.7 + Knee: 98.0 + Mean: 96.7 + Sho: 97.9 + Wri: 91.6 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256-c4ec1a0b_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 92.5 + Elb: 94.6 + Head: 99.1 + Hip: 99.4 + Knee: 94.6 + Mean: 96.1 + Sho: 98.5 + Wri: 92.0 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256-f0574a52_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 93.8 + Elb: 91.0 + Head: 99.3 + Hip: 99.1 + Knee: 96.5 + Mean: 95.2 + Sho: 97.8 + Wri: 87.0 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256-f63af0ff_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 93.8 + Elb: 94.3 + Head: 98.8 + Hip: 99.8 + Knee: 97.5 + Mean: 96.7 + Sho: 98.4 + Wri: 92.1 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256-c4bc2ddb_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50_8xb64-20e_jhmdb-sub1-256x256 + Results: + - Dataset: 
JHMDB + Metrics: + Ank: 78.9 + Elb: 74.4 + Head: 93.3 + Hip: 85.0 + Knee: 81.2 + Mean: 81.9 + Sho: 83.2 + Wri: 72.7 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256-932cb3b4_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50_8xb64-20e_jhmdb-sub2-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 78.6 + Elb: 64.5 + Head: 94.1 + Hip: 77.9 + Knee: 71.9 + Mean: 75.5 + Sho: 74.9 + Wri: 62.5 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256-83d606f7_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50_8xb64-20e_jhmdb-sub3-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 84.2 + Elb: 74.9 + Head: 97.0 + Hip: 84.7 + Knee: 83.7 + Mean: 82.9 + Sho: 82.2 + Wri: 70.7 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256-c4ec1a0b_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 75.0 + Elb: 73.2 + Head: 92.4 + Hip: 82.3 + Knee: 75.4 + Mean: 79.2 + Sho: 80.6 + Wri: 70.5 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256-f0574a52_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py + In Collection: SimpleBaseline2D + Metadata: + 
Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 75.5 + Elb: 63.8 + Head: 93.4 + Hip: 75.1 + Knee: 68.4 + Mean: 73.7 + Sho: 73.6 + Wri: 60.5 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256-f63af0ff_20201122.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: JHMDB + Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256 + Results: + - Dataset: JHMDB + Metrics: + Ank: 81.5 + Elb: 72.6 + Head: 96.1 + Hip: 83.6 + Knee: 80.9 + Mean: 81.2 + Sho: 81.2 + Wri: 67.9 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256-c4bc2ddb_20201122.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py new file mode 100644 index 0000000..fb59f0a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py @@ -0,0 +1,127 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=40, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=40, + milestones=[20, 30], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) + +# 
codec settings +codec = dict( + type='MSRAHeatmap', input_size=(368, 368), heatmap_size=(46, 46), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CPM', + in_channels=3, + out_channels=15, + feat_channels=128, + num_stages=6), + head=dict( + type='CPMHead', + in_channels=15, + out_channels=15, + num_stages=6, + deconv_out_channels=None, + final_layer=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'JhmdbDataset' +data_mode = 'topdown' +data_root = 'data/jhmdb/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub1_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/Sub1_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py new file mode 100644 index 0000000..84875ca --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py @@ -0,0 +1,127 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=40, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=40, + milestones=[20, 30], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(368, 368), heatmap_size=(46, 46), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CPM', + in_channels=3, + out_channels=15, + feat_channels=128, + num_stages=6), + head=dict( + type='CPMHead', + in_channels=15, + out_channels=15, + num_stages=6, + deconv_out_channels=None, + final_layer=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + 
test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'JhmdbDataset' +data_mode = 'topdown' +data_root = 'data/jhmdb/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub2_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub2_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py new file mode 100644 index 0000000..9995cbf --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py @@ -0,0 +1,127 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=40, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=40, + milestones=[20, 30], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(368, 368), heatmap_size=(46, 46), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CPM', + in_channels=3, + out_channels=15, + feat_channels=128, + num_stages=6), + head=dict( + type='CPMHead', + in_channels=15, + out_channels=15, + num_stages=6, + deconv_out_channels=None, + final_layer=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'JhmdbDataset' +data_mode = 'topdown' +data_root = 'data/jhmdb/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub3_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub3_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py new file mode 100644 index 0000000..8eba9a4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=40, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=40, + milestones=[20, 30], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size 
+auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(32, 32), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ResNet', depth=50), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=15, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) +load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501 + +# base dataset settings +dataset_type = 'JhmdbDataset' +data_mode = 'topdown' +data_root = 'data/jhmdb/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub1_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + 
persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub1_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py new file mode 100644 index 0000000..627f74e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=40, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=40, + milestones=[20, 30], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(32, 32), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ResNet', depth=50), + head=dict( + type='HeatmapHead', + in_channels=2048, + 
out_channels=15, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) +load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501 + +# base dataset settings +dataset_type = 'JhmdbDataset' +data_mode = 'topdown' +data_root = 'data/jhmdb/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub2_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub2_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), +] +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py new file mode 100644 index 0000000..c61e18b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=40, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=40, + milestones=[20, 30], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(32, 32), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ResNet', depth=50), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=15, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) +load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501 + +# base dataset settings +dataset_type = 'JhmdbDataset' +data_mode = 'topdown' +data_root = 'data/jhmdb/' + +# pipelines 
+train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub3_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub3_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py new file mode 100644 index 0000000..2bb5068 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=20, val_interval=1) + +# optimizer +optim_wrapper = 
dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=20, + milestones=[8, 15], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ResNet', depth=50), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=15, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) +load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501 + +# base dataset settings +dataset_type = 'JhmdbDataset' +data_mode = 'topdown' +data_root = 'data/jhmdb/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub1_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub1_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py new file mode 100644 index 0000000..3cdcaff --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=20, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=20, + milestones=[8, 15], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + 
type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ResNet', depth=50), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=15, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) +load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501 + +# base dataset settings +dataset_type = 'JhmdbDataset' +data_mode = 'topdown' +data_root = 'data/jhmdb/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub2_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub2_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# 
evaluators +val_evaluator = [ + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py new file mode 100644 index 0000000..151a2a3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=20, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=20, + milestones=[8, 15], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ResNet', depth=50), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=15, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) +load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501 + +# base dataset settings +dataset_type = 'JhmdbDataset' +data_mode = 
'topdown' +data_root = 'data/jhmdb/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub3_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/Sub3_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.md new file mode 100644 index 0000000..ac25f9c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.md @@ -0,0 +1,39 @@ + + +
+CPM (CVPR'2016) + +```bibtex +@inproceedings{wei2016convolutional, + title={Convolutional pose machines}, + author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser}, + booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition}, + pages={4724--4732}, + year={2016} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [cpm](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py) | 368x368 | 0.876 | 0.285 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_mpii_368x368-116e62b8_20200822.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_mpii_368x368_20200822.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.yml new file mode 100644 index 0000000..077e0cb --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.yml @@ -0,0 +1,15 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py + In Collection: CPM + Metadata: + Architecture: + - CPM + Training Data: MPII + Name: td-hm_cpm_8xb64-210e_mpii-368x368 + Results: + - Dataset: MPII + Metrics: + Mean: 0.876 + Mean@0.1: 0.285 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_mpii_368x368-116e62b8_20200822.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..d9c4552 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py @@ -0,0 +1,210 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) + +# optimizer +optim_wrapper = dict( + 
type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/MPI/', +# f'{data_root}': 's3://openmmlab/datasets/pose/MPI/' +# })) + +# pipelines 
+train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, 
+ )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file=f'{data_root}/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.md new file mode 100644 index 0000000..1256ae9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.md @@ -0,0 +1,57 @@ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py) | 256x256 | 0.902 | 0.303 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-mpii_pt-in1k_210e-256x256-68d0402f_20230208.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-mpii_pt-in1k_210e-256x256-68d0402f_20230208.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.yml new file mode 100644 index 0000000..e1c738c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py + In Collection: UDP + Metadata: + Architecture: + - UDP + - CSPNeXt + Training Data: MPII + Name: cspnext-m_udp_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.902 + Mean@0.1: 0.303 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-mpii_pt-in1k_210e-256x256-68d0402f_20230208.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.md new file mode 100644 index 0000000..ca29dc2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.md @@ -0,0 +1,41 @@ + + +
+Hourglass (ECCV'2016) + +```bibtex +@inproceedings{newell2016stacked, + title={Stacked hourglass networks for human pose estimation}, + author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia}, + booktitle={European conference on computer vision}, + pages={483--499}, + year={2016}, + organization={Springer} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_hourglass_52](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py) | 256x256 | 0.889 | 0.317 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_256x256-ae358435_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_256x256_20200812.log.json) | +| [pose_hourglass_52](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py) | 384x384 | 0.894 | 0.367 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_384x384-04090bc3_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_384x384_20200812.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.yml new file mode 100644 index 0000000..17a5c3c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.yml @@ -0,0 +1,28 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py + In Collection: Hourglass + Metadata: + Architecture: &id001 + - Hourglass + Training Data: MPII + Name: td-hm_hourglass52_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.889 + Mean@0.1: 0.317 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_256x256-ae358435_20200812.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py + In Collection: Hourglass + Metadata: + 
Architecture: *id001 + Training Data: MPII + Name: td-hm_hourglass52_8xb32-210e_mpii-384x384 + Results: + - Dataset: MPII + Metrics: + Mean: 0.894 + Mean@0.1: 0.367 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_384x384-04090bc3_20200812.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.md new file mode 100644 index 0000000..5a089f2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.md @@ -0,0 +1,57 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_hrnet_w32_dark](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py) | 256x256 | 0.904 | 0.354 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256_dark-f1601c5b_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256_dark_20200927.log.json) | +| [pose_hrnet_w48_dark](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py) | 256x256 | 0.905 | 0.360 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256_dark-0decd39f_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256_dark_20200927.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.yml new file mode 100644 index 0000000..1f19ecf --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.yml @@ -0,0 +1,29 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py + In Collection: DarkPose + Metadata: + Architecture: &id001 + - HRNet + - DarkPose + Training Data: MPII + Name: td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.904 + Mean@0.1: 0.354 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256_dark-f1601c5b_20200927.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py + In Collection: 
DarkPose + Metadata: + Architecture: *id001 + Training Data: MPII + Name: td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.905 + Mean@0.1: 0.36 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256_dark-0decd39f_20200927.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.md new file mode 100644 index 0000000..c8ea9e3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.md @@ -0,0 +1,40 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py) | 256x256 | 0.900 | 0.334 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256-6c4f923f_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256_20200812.log.json) | +| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py) | 256x256 | 0.901 | 0.337 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256-92cab7bd_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256_20200812.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.yml new file mode 100644 index 0000000..b2ead58 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.yml @@ -0,0 +1,28 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py + In Collection: HRNet + Metadata: + Architecture: &id001 + - HRNet + Training Data: MPII + Name: td-hm_hrnet-w32_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.9 + Mean@0.1: 0.334 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256-6c4f923f_20200812.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: MPII + Name: 
td-hm_hrnet-w48_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.901 + Mean@0.1: 0.337 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256-92cab7bd_20200812.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.md new file mode 100644 index 0000000..21211e6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.md @@ -0,0 +1,39 @@ + + +
+LiteHRNet (CVPR'2021) + +```bibtex +@inproceedings{Yulitehrnet21, + title={Lite-HRNet: A Lightweight High-Resolution Network}, + author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong}, + booktitle={CVPR}, + year={2021} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [LiteHRNet-18](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py) | 256x256 | 0.859 | 0.260 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_mpii_256x256-cabd7984_20210623.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_mpii_256x256_20210623.log.json) | +| [LiteHRNet-30](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py) | 256x256 | 0.869 | 0.271 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_mpii_256x256-faae8bd8_20210622.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_mpii_256x256_20210622.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.yml new file mode 100644 index 0000000..940eaf6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.yml @@ -0,0 +1,28 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py + In Collection: LiteHRNet + Metadata: + Architecture: &id001 + - LiteHRNet + Training Data: MPII + Name: td-hm_litehrnet-18_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.859 + Mean@0.1: 0.26 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_mpii_256x256-cabd7984_20210623.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py + In Collection: LiteHRNet + Metadata: + Architecture: 
*id001 + Training Data: MPII + Name: td-hm_litehrnet-30_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.869 + Mean@0.1: 0.271 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_mpii_256x256-faae8bd8_20210622.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.md new file mode 100644 index 0000000..6343855 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.md @@ -0,0 +1,39 @@ + + +
+MobilenetV2 (CVPR'2018) + +```bibtex +@inproceedings{sandler2018mobilenetv2, + title={Mobilenetv2: Inverted residuals and linear bottlenecks}, + author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4510--4520}, + year={2018} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py) | 256x256 | 0.854 | 0.234 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_mpii_256x256-e068afa7_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_mpii_256x256_20200812.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.yml new file mode 100644 index 0000000..09d65dd --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - MobilenetV2 + Training Data: MPII + Name: td-hm_mobilenetv2_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.854 + Mean@0.1: 0.234 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_mpii_256x256-e068afa7_20200812.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.md new file mode 100644 index 0000000..790746b --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.md @@ -0,0 +1,58 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.882 | 0.286 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256_20200812.log.json) | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.888 | 0.290 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_mpii_256x256-416f5d71_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_mpii_256x256_20200812.log.json) | +| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py) | 256x256 | 0.889 | 0.303 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_mpii_256x256-3ecba29d_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_mpii_256x256_20200812.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.yml new file mode 100644 index 0000000..14ae910 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.yml @@ -0,0 +1,42 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: MPII + Name: td-hm_res50_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.882 + Mean@0.1: 
0.286 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: MPII + Name: td-hm_res101_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.888 + Mean@0.1: 0.29 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_mpii_256x256-416f5d71_20200812.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: MPII + Name: td-hm_res152_8xb32-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.889 + Mean@0.1: 0.303 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res152_mpii_256x256-3ecba29d_20200812.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.md new file mode 100644 index 0000000..09ffe42 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.md @@ -0,0 +1,41 @@ + + +
+ResNetV1D (CVPR'2019) + +```bibtex +@inproceedings{he2019bag, + title={Bag of tricks for image classification with convolutional neural networks}, + author={He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={558--567}, + year={2019} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.881 | 0.290 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_mpii_256x256-2337a92e_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_mpii_256x256_20200812.log.json) | +| [pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.883 | 0.295 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_mpii_256x256-2851d710_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_mpii_256x256_20200812.log.json) | +| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py) | 256x256 | 0.888 | 0.300 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_mpii_256x256-8b10a87c_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_mpii_256x256_20200812.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.yml new file mode 100644 index 0000000..b6c902d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.yml @@ -0,0 +1,42 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNetV1D + Training Data: MPII + Name: 
td-hm_resnetv1d50_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.881 + Mean@0.1: 0.29 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_mpii_256x256-2337a92e_20200812.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: MPII + Name: td-hm_resnetv1d101_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.883 + Mean@0.1: 0.295 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_mpii_256x256-2851d710_20200812.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: MPII + Name: td-hm_resnetv1d152_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.888 + Mean@0.1: 0.3 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_mpii_256x256-8b10a87c_20200812.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.md new file mode 100644 index 0000000..64eb483 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.md @@ -0,0 +1,39 @@ + + +
+ResNext (CVPR'2017) + +```bibtex +@inproceedings{xie2017aggregated, + title={Aggregated residual transformations for deep neural networks}, + author={Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1492--1500}, + year={2017} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_resnext_152](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py) | 256x256 | 0.887 | 0.294 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_mpii_256x256-df302719_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_mpii_256x256_20200927.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.yml new file mode 100644 index 0000000..feb338e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - ResNext + Training Data: MPII + Name: td-hm_resnext152_8xb32-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.887 + Mean@0.1: 0.294 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_mpii_256x256-df302719_20200927.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.md new file mode 100644 index 0000000..eaa8a64 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.md @@ -0,0 +1,40 @@ + + +
+SCNet (CVPR'2020) + +```bibtex +@inproceedings{liu2020improving, + title={Improving Convolutional Networks with Self-Calibrated Convolutions}, + author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={10096--10105}, + year={2020} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_scnet_50](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.888 | 0.290 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_mpii_256x256-a54b6af5_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_mpii_256x256_20200812.log.json) | +| [pose_scnet_101](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.887 | 0.293 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_mpii_256x256-b4c2d184_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_mpii_256x256_20200812.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.yml new file mode 100644 index 0000000..d132448 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.yml @@ -0,0 +1,29 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - SCNet + Training Data: MPII + Name: td-hm_scnet50_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.888 + Mean@0.1: 0.29 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_mpii_256x256-a54b6af5_20200812.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: MPII + Name: 
td-hm_scnet101_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.887 + Mean@0.1: 0.293 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_mpii_256x256-b4c2d184_20200812.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.md new file mode 100644 index 0000000..812fd70 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.md @@ -0,0 +1,43 @@ + + +
+SEResNet (CVPR'2018) + +```bibtex +@inproceedings{hu2018squeeze, + title={Squeeze-and-excitation networks}, + author={Hu, Jie and Shen, Li and Sun, Gang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={7132--7141}, + year={2018} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_seresnet_50](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.884 | 0.292 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_mpii_256x256-1bb21f79_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_mpii_256x256_20200927.log.json) |
+| [pose_seresnet_101](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.884 | 0.295 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_mpii_256x256-0ba14ff5_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_mpii_256x256_20200927.log.json) |
+| [pose_seresnet_152\*](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py) | 256x256 | 0.884 | 0.287 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_mpii_256x256-6ea1e774_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_mpii_256x256_20200927.log.json) |
+
+Note that * means without ImageNet pre-training.
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.yml new file mode 100644 index 0000000..8d6a3e4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.yml @@ -0,0 +1,42 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - SEResNet + Training Data: MPII + Name: td-hm_seresnet50_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.884 + Mean@0.1: 0.292 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_mpii_256x256-1bb21f79_20200927.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: MPII + Name: td-hm_seresnet101_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.884 + Mean@0.1: 0.295 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_mpii_256x256-0ba14ff5_20200927.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: MPII + Name: td-hm_seresnet152_8xb32-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.884 + Mean@0.1: 0.287 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_mpii_256x256-6ea1e774_20200927.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.md new file mode 100644 index 0000000..b8ccb8c --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.md @@ -0,0 +1,39 @@ + + +
+ShufflenetV1 (CVPR'2018) + +```bibtex +@inproceedings{zhang2018shufflenet, + title={Shufflenet: An extremely efficient convolutional neural network for mobile devices}, + author={Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={6848--6856}, + year={2018} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py) | 256x256 | 0.824 | 0.195 | [ckpt](https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_mpii_256x256-dcc1c896_20200925.pth) | [log](https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_mpii_256x256_20200925.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.yml new file mode 100644 index 0000000..66d6e4e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - ShufflenetV1 + Training Data: MPII + Name: td-hm_shufflenetv1_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.824 + Mean@0.1: 0.195 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_mpii_256x256-dcc1c896_20200925.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.md new file mode 100644 index 0000000..7f13623 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.md @@ -0,0 +1,39 @@ + + +
+ShufflenetV2 (ECCV'2018) + +```bibtex +@inproceedings{ma2018shufflenet, + title={Shufflenet v2: Practical guidelines for efficient cnn architecture design}, + author={Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={116--131}, + year={2018} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py) | 256x256 | 0.828 | 0.205 | [ckpt](https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_mpii_256x256-4fb9df2d_20200925.pth) | [log](https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_mpii_256x256_20200925.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.yml new file mode 100644 index 0000000..71ff431 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - ShufflenetV2 + Training Data: MPII + Name: td-hm_shufflenetv2_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.828 + Mean@0.1: 0.205 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_mpii_256x256-4fb9df2d_20200925.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py new file mode 100644 index 0000000..cf47ecd --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = 
dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(368, 368), heatmap_size=(46, 46), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CPM', + in_channels=3, + out_channels=16, + feat_channels=128, + num_stages=6), + head=dict( + type='CPMHead', + in_channels=16, + out_channels=16, + num_stages=6, + deconv_out_channels=None, + final_layer=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( 
+ batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py new file mode 100644 index 0000000..1754065 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py @@ -0,0 +1,118 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(384, 
384), heatmap_size=(96, 96), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HourglassNet', + num_stacks=1, + ), + head=dict( + type='CPMHead', + in_channels=256, + out_channels=16, + num_stages=1, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) 
+test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..07f13ce --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py @@ -0,0 +1,118 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HourglassNet', + num_stacks=1, + ), + head=dict( + type='CPMHead', + in_channels=256, + out_channels=16, + num_stages=1, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..7ee018d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py @@ -0,0 +1,146 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + 
+# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=16, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..f22c0f8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, 
start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=16, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..3101359 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py @@ -0,0 +1,146 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + 
milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=16, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..9435d79 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training 
batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=16, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..a95e33d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py @@ -0,0 +1,137 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = 
dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='LiteHRNet', + in_channels=3, + extra=dict( + stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + num_stages=3, + stages_spec=dict( + num_modules=(2, 4, 2), + num_branches=(2, 3, 4), + num_blocks=(2, 2, 2), + module_type=('LITE', 'LITE', 'LITE'), + with_fuse=(True, True, True), + reduce_ratios=(8, 8, 8), + num_channels=( + (40, 80), + (40, 80, 160), + (40, 80, 160, 320), + )), + with_head=True, + )), + head=dict( + type='HeatmapHead', + in_channels=40, + out_channels=16, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..a7b4400 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py @@ -0,0 +1,137 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], 
+ std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='LiteHRNet', + in_channels=3, + extra=dict( + stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + num_stages=3, + stages_spec=dict( + num_modules=(3, 8, 3), + num_branches=(2, 3, 4), + num_blocks=(2, 2, 2), + module_type=('LITE', 'LITE', 'LITE'), + with_fuse=(True, True, True), + reduce_ratios=(8, 8, 8), + num_channels=( + (40, 80), + (40, 80, 160), + (40, 80, 160, 320), + )), + with_head=True, + )), + head=dict( + type='HeatmapHead', + in_channels=40, + out_channels=16, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..6b40e19 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py @@ -0,0 +1,118 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MobileNetV2', + widen_factor=1., + out_indices=(7, ), + init_cfg=dict(type='Pretrained', checkpoint='mmcls://mobilenet_v2'), + ), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=16, + loss=dict(type='KeypointMSELoss', 
use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..0bd5fd8 --- /dev/null +++ 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py new file mode 100644 index 0000000..a2d86f8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = 
dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..d22c405 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + 
shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..25c4087 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNetV1d', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet101_v1d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + 
num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..ce43cf3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), 
heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNetV1d', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet152_v1d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + 
pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..a2853f8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNetV1d', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet50_v1d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 
'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py new file mode 100644 index 0000000..8bfe3ef --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py @@ -0,0 +1,118 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = 
dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeXt', + depth=152, + init_cfg=dict( + type='Pretrained', checkpoint='mmcls://resnext152_32x4d'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..9ae0c8d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', 
+ mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SCNet', + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/scnet101-94250a77.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = 
dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..6e2206a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SCNet', + depth=50, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/scnet50-7ef0a199.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..7ead848 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, 
+)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SEResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py new file mode 100644 index 0000000..7c2486d --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py @@ -0,0 +1,116 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + 
std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SEResNet', + depth=152, + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..c14ba34 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SEResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..1b8ac62 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + 
gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ShuffleNetV1', + groups=3, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v1'), + ), + head=dict( + type='HeatmapHead', + in_channels=960, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + 
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..e39aff8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ShuffleNetV2', + widen_factor=1.0, + init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v2'), + ), + head=dict( + 
type='HeatmapHead', + in_channels=1024, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file='data/mpii/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.md new file mode 100644 index 0000000..a6c5899 --- 
/dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.md @@ -0,0 +1,55 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+PoseTrack18 (CVPR'2018) + +```bibtex +@inproceedings{andriluka2018posetrack, + title={Posetrack: A benchmark for human pose estimation and tracking}, + author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={5167--5176}, + year={2018} +} +``` + +
+ +Results on PoseTrack2018 val with ground-truth bounding boxes + +| Arch | Input Size | Head | Shou | Elb | Wri | Hip | Knee | Ankl | Total | ckpt | log | +| :--------------------------------------------------- | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :---: | :---------------------------------------------------: | :--------------------------------------------------: | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py) | 256x192 | 86.2 | 89.0 | 84.5 | 79.2 | 82.3 | 82.5 | 78.7 | 83.4 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192-1ee951c4_20201028.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192_20201028.log.json) | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py) | 384x288 | 87.1 | 89.0 | 85.1 | 80.2 | 80.6 | 82.8 | 79.6 | 83.7 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288-806f00a3_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288_20211130.log.json) | +| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py) | 256x192 | 88.3 | 90.2 | 86.0 | 81.0 | 80.7 | 83.3 | 80.6 | 84.6 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192-b5d9b3f1_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192_20211130.log.json) | +| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py) | 384x288 | 87.8 | 90.0 | 86.2 | 81.3 | 81.0 | 83.4 | 80.9 | 84.6 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288-5fd6d3ff_20211130.pth) | 
[log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288_20211130.log.json) | + +The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18. + +Results on PoseTrack2018 val with [MMDetection](https://github.com/open-mmlab/mmdetection) pre-trained [Cascade R-CNN](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth) (X-101-64x4d-FPN) human detector + +| Arch | Input Size | Head | Shou | Elb | Wri | Hip | Knee | Ankl | Total | ckpt | log | +| :--------------------------------------------------- | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :---: | :---------------------------------------------------: | :--------------------------------------------------: | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py) | 256x192 | 78.0 | 82.9 | 79.5 | 73.8 | 76.9 | 76.6 | 70.2 | 76.9 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192-1ee951c4_20201028.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192_20201028.log.json) | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py) | 384x288 | 79.9 | 83.6 | 80.4 | 74.5 | 74.8 | 76.1 | 70.5 | 77.3 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288-806f00a3_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288_20211130.log.json) | +| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py) | 256x192 | 80.1 | 83.4 | 80.6 | 74.8 | 74.3 | 76.8 | 70.5 | 77.4 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192-b5d9b3f1_20211130.pth) | 
[log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192_20211130.log.json) | +| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py) | 384x288 | 80.2 | 83.8 | 80.9 | 75.2 | 74.7 | 76.7 | 71.7 | 77.8 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288-5fd6d3ff_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288_20211130.log.json) | + +The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18. diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.yml new file mode 100644 index 0000000..c2a0787 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.yml @@ -0,0 +1,154 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py + In Collection: HRNet + Metadata: + Architecture: &id001 + - HRNet + Training Data: PoseTrack18 + Name: td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192 + Results: + - Dataset: PoseTrack18 + Metrics: + Ankl: 78.7 + Elb: 84.5 + Head: 86.2 + Hip: 82.3 + Knee: 82.5 + Shou: 89 + Total: 83.4 + Wri: 79.2 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192-1ee951c4_20201028.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: PoseTrack18 + Name: td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288 + Results: + - Dataset: PoseTrack18 + Metrics: + Ankl: 79.6 + Elb: 84.5 + Head: 87.1 + Hip: 80.6 + Knee: 82.8 + Shou: 89 + Total: 83.7 + Wri: 80.2 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288-806f00a3_20211130.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: PoseTrack18 + Name: td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192 + Results: + - Dataset: PoseTrack18 + Metrics: + Ankl: 79.6 + Elb: 85.1 + Head: 88.3 + Hip: 80.6 + Knee: 82.8 + Shou: 90.2 + Total: 84.6 + Wri: 81 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192-b5d9b3f1_20211130.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: PoseTrack18 + Name: td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288 + Results: + - Dataset: PoseTrack18 + Metrics: + Ankl: 80.6 + Elb: 86.2 + Head: 87.8 + Hip: 81 + Knee: 83.4 + Shou: 90 + Total: 84.6 + Wri: 81.3 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288-5fd6d3ff_20211130.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: PoseTrack18 + Name: td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192 + Results: + - Dataset: PoseTrack18 + Metrics: + Ankl: 70.2 + Elb: 79.5 + Head: 78.0 + Hip: 76.9 + Knee: 76.6 + Shou: 82.9 + Total: 76.9 + Wri: 73.8 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192-1ee951c4_20201028.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: PoseTrack18 + Name: td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288 + Results: + - 
Dataset: PoseTrack18 + Metrics: + Ankl: 70.5 + Elb: 80.4 + Head: 79.9 + Hip: 74.8 + Knee: 76.1 + Shou: 83.6 + Total: 77.3 + Wri: 74.5 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288-806f00a3_20211130.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: PoseTrack18 + Name: td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192 + Results: + - Dataset: PoseTrack18 + Metrics: + Ankl: 70.4 + Elb: 80.6 + Head: 80.1 + Hip: 74.3 + Knee: 76.8 + Shou: 83.4 + Total: 77.4 + Wri: 74.8 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192-b5d9b3f1_20211130.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: PoseTrack18 + Name: td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288 + Results: + - Dataset: PoseTrack18 + Metrics: + Ankl: 71.7 + Elb: 80.9 + Head: 80.2 + Hip: 74.7 + Knee: 76.7 + Shou: 83.8 + Total: 77.8 + Wri: 75.2 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288-5fd6d3ff_20211130.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.md new file mode 100644 index 0000000..e172780 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.md @@ -0,0 +1,58 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+PoseTrack18 (CVPR'2018) + +```bibtex +@inproceedings{andriluka2018posetrack, + title={Posetrack: A benchmark for human pose estimation and tracking}, + author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={5167--5176}, + year={2018} +} +``` + +
+ +Results on PoseTrack2018 val with ground-truth bounding boxes + +| Arch | Input Size | Head | Shou | Elb | Wri | Hip | Knee | Ankl | Total | ckpt | log | +| :--------------------------------------------------- | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :---: | :---------------------------------------------------: | :--------------------------------------------------: | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py) | 256x192 | 86.5 | 87.7 | 82.5 | 75.8 | 80.1 | 78.8 | 74.2 | 81.2 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_posetrack18_256x192-a62807c7_20201028.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_posetrack18_256x192_20201028.log.json) | + +The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18. diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.yml new file mode 100644 index 0000000..a15fa9f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.yml @@ -0,0 +1,22 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: PoseTrack18 + Name: td-hm_res50_8xb64-20e_posetrack18-256x192 + Results: + - Dataset: PoseTrack18 + Metrics: + Ankl: 74.2 + Elb: 82.5 + Head: 86.5 + Hip: 80.1 + Knee: 78.8 + Shou: 87.7 + Total: 81.2 + Wri: 75.8 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_posetrack18_256x192-a62807c7_20201028.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py new file mode 100644 index 0000000..63e35a4 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py @@ -0,0 +1,155 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=20, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=20, + milestones=[10, 15], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) + +# load from the pretrained model +load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth' # noqa: E501 + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + 
num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'PoseTrack18Dataset' +data_mode = 'topdown' +data_root = 'data/posetrack18/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/posetrack18_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/posetrack18_val.json', + # comment `bbox_file` and '`filter_cfg` if use gt bbox for evaluation + bbox_file='data/posetrack18/annotations/' + 'posetrack18_val_human_detections.json', + filter_cfg=dict(bbox_score_thr=0.4), + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + 
+val_evaluator = dict( + type='PoseTrack18Metric', + ann_file=data_root + 'annotations/posetrack18_val.json', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py new file mode 100644 index 0000000..04a4522 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py @@ -0,0 +1,155 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=20, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=20, + milestones=[10, 15], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) + +# load from the pretrained model +load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288-ca5956af_20220909.pth' # noqa: E501 + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + 
num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'PoseTrack18Dataset' +data_mode = 'topdown' +data_root = 'data/posetrack18/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/posetrack18_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/posetrack18_val.json', + # comment `bbox_file` and 
'`filter_cfg` if use gt bbox for evaluation + bbox_file='data/posetrack18/annotations/' + 'posetrack18_val_human_detections.json', + filter_cfg=dict(bbox_score_thr=0.4), + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='PoseTrack18Metric', + ann_file=data_root + 'annotations/posetrack18_val.json', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py new file mode 100644 index 0000000..90e81d0 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py @@ -0,0 +1,155 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=20, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=20, + milestones=[10, 15], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) + +# load from the pretrained model +load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth' # noqa: E501 + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + 
data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'PoseTrack18Dataset' +data_mode = 'topdown' +data_root = 'data/posetrack18/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/posetrack18_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + 
num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/posetrack18_val.json', + # comment `bbox_file` and '`filter_cfg` if use gt bbox for evaluation + bbox_file='data/posetrack18/annotations/' + 'posetrack18_val_human_detections.json', + filter_cfg=dict(bbox_score_thr=0.4), + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='PoseTrack18Metric', + ann_file=data_root + 'annotations/posetrack18_val.json', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py new file mode 100644 index 0000000..32189ff --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py @@ -0,0 +1,155 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=20, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=20, + milestones=[10, 15], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) + +# load from the pretrained model +load_from = 
'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288-c161b7de_20220915.pth' # noqa: E501 + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'PoseTrack18Dataset' +data_mode = 'topdown' +data_root = 'data/posetrack18/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# 
data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/posetrack18_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/posetrack18_val.json', + # comment `bbox_file` and '`filter_cfg` if use gt bbox for evaluation + bbox_file='data/posetrack18/annotations/' + 'posetrack18_val_human_detections.json', + filter_cfg=dict(bbox_score_thr=0.4), + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='PoseTrack18Metric', + ann_file=data_root + 'annotations/posetrack18_val.json', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py new file mode 100644 index 0000000..22c7c11 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py @@ -0,0 +1,126 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=20, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=20, + milestones=[10, 15], + gamma=0.1, + by_epoch=True) +] + +# automatically 
scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) + +# load from the pretrained model +load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192-ec54d7f3_20200709.pth' # noqa: E501 + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'PoseTrack18Dataset' +data_mode = 'topdown' +data_root = 'data/posetrack18/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/posetrack18_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/posetrack18_val.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='PoseTrack18Metric', + ann_file=data_root + 'annotations/posetrack18_val.json', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/README.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/README.md new file mode 100644 index 0000000..ed247c3 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/README.md @@ -0,0 +1,32 @@ +# Top-down regression-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. At the 2nd stage, regression based methods directly regress the keypoint coordinates given the features extracted from the bounding box area, following the paradigm introduced in [Deeppose: Human pose estimation via deep neural networks](http://openaccess.thecvf.com/content_cvpr_2014/html/Toshev_DeepPose_Human_Pose_2014_CVPR_paper.html). + +
+ +
+ +## Results and Models + +### COCO Dataset + +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Model | Input Size | AP | AR | Details and Download | +| :--------------: | :--------: | :---: | :---: | :-------------------------------------------------------: | +| ResNet-152+RLE | 256x192 | 0.731 | 0.805 | [resnet_rle_coco.md](./coco/resnet_rle_coco.md) | +| ResNet-101+RLE | 256x192 | 0.722 | 0.768 | [resnet_rle_coco.md](./coco/resnet_rle_coco.md) | +| ResNet-50+RLE | 256x192 | 0.706 | 0.768 | [resnet_rle_coco.md](./coco/resnet_rle_coco.md) | +| MobileNet-v2+RLE | 256x192 | 0.593 | 0.644 | [mobilenetv2_rle_coco.md](./coco/mobilenetv2_rle_coco.md) | +| ResNet-152 | 256x192 | 0.584 | 0.688 | [resnet_coco.md](./coco/resnet_coco.md) | +| ResNet-101 | 256x192 | 0.562 | 0.670 | [resnet_coco.md](./coco/resnet_coco.md) | +| ResNet-50 | 256x192 | 0.528 | 0.639 | [resnet_coco.md](./coco/resnet_coco.md) | + +### MPII Dataset + +| Model | Input Size | PCKh@0.5 | PCKh@0.1 | Details and Download | +| :-----------: | :--------: | :------: | :------: | :---------------------------------------------: | +| ResNet-50+RLE | 256x256 | 0.861 | 0.277 | [resnet_rle_mpii.md](./mpii/resnet_rle_mpii.md) | +| ResNet-152 | 256x256 | 0.850 | 0.208 | [resnet_mpii.md](./mpii/resnet_mpii.md) | +| ResNet-101 | 256x256 | 0.841 | 0.200 | [resnet_mpii.md](./mpii/resnet_mpii.md) | +| ResNet-50 | 256x256 | 0.826 | 0.180 | [resnet_mpii.md](./mpii/resnet_mpii.md) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.md new file mode 100644 index 0000000..825c40c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.md @@ -0,0 +1,74 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+RLE (ICCV'2021) + +```bibtex +@inproceedings{li2021human, + title={Human pose regression with residual log-likelihood estimation}, + author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={11025--11034}, + year={2021} +} +``` + +
+ + + +
+MobilenetV2 (CVPR'2018) + +```bibtex +@inproceedings{sandler2018mobilenetv2, + title={Mobilenetv2: Inverted residuals and linear bottlenecks}, + author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4510--4520}, + year={2018} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [deeppose_mobilenetv2_rle_pretrained](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py) | 256x192 | 0.593 | 0.836 | 0.660 | 0.644 | 0.877 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192-39b73bd5_20220922.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192-39b73bd5_20220922.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.yml new file mode 100644 index 0000000..1dda49e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.yml @@ -0,0 +1,20 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py + In Collection: RLE + Metadata: + Architecture: &id001 + - DeepPose + - RLE + - MobileNet + Training Data: COCO + Name: td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.593 + AP@0.5: 0.836 + AP@0.75: 0.66 + AR: 0.644 + AR@0.5: 0.877 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192-39b73bd5_20220922.pth diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.md new file mode 100644 index 0000000..fd9a8c8 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.md @@ -0,0 +1,59 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [deeppose_resnet_50](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.541 | 0.824 | 0.601 | 0.649 | 0.893 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192-72ef04f3_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192-72ef04f3_20220913.log.json) | +| [deeppose_resnet_101](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py) | 256x192 | 0.562 | 0.831 | 0.629 | 0.670 | 0.900 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192-2f247111_20210205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192_20210205.log.json) | +| [deeppose_resnet_152](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py) | 256x192 | 0.584 | 0.842 | 0.659 | 0.688 | 0.907 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192-7df89a88_20210205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192_20210205.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.yml new file mode 100644 index 0000000..07fae5e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.yml @@ -0,0 +1,57 @@ 
+Collections: +- Name: DeepPose + Paper: + Title: "Deeppose: Human pose estimation via deep neural networks" + URL: http://openaccess.thecvf.com/content_cvpr_2014/html/Toshev_DeepPose_Human_Pose_2014_CVPR_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/deeppose.md +Models: +- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py + In Collection: DeepPose + Metadata: + Architecture: &id001 + - DeepPose + - ResNet + Training Data: COCO + Name: td-reg_res50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.541 + AP@0.5: 0.824 + AP@0.75: 0.601 + AR: 0.649 + AR@0.5: 0.893 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192-72ef04f3_20220913.pth +- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py + In Collection: DeepPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-reg_res101_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.562 + AP@0.5: 0.831 + AP@0.75: 0.629 + AR: 0.67 + AR@0.5: 0.9 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192-2f247111_20210205.pth +- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py + In Collection: DeepPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-reg_res152_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.584 + AP@0.5: 0.842 + AP@0.75: 0.659 + AR: 0.688 + AR@0.5: 0.907 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192-7df89a88_20210205.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.md 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.md new file mode 100644 index 0000000..365d244 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.md @@ -0,0 +1,78 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+RLE (ICCV'2021) + +```bibtex +@inproceedings{li2021human, + title={Human pose regression with residual log-likelihood estimation}, + author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={11025--11034}, + year={2021} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [deeppose_resnet_50_rle](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py) | 256x192 | 0.706 | 0.888 | 0.776 | 0.753 | 0.924 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192-d37efd64_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192-d37efd64_20220913.log.json) | +| [deeppose_resnet_50_rle_pretrained](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py) | 256x192 | 0.719 | 0.891 | 0.788 | 0.764 | 0.925 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192-2cb494ee_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192-2cb494ee_20220913.log.json) | +| [deeppose_resnet_101_rle](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py) | 256x192 | 0.722 | 0.894 | 0.794 | 0.768 | 0.930 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192_rle-16c3d461_20220615.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192_rle_20220615.log.json) | +| [deeppose_resnet_152_rle](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py) | 256x192 | 0.731 | 0.897 | 0.805 | 0.777 | 0.933 
| [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192_rle-c05bdccf_20220615.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192_rle_20220615.log.json) | +| [deeppose_resnet_152_rle](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py) | 384x288 | 0.749 | 0.901 | 0.815 | 0.793 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_384x288_rle-b77c4c37_20220624.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_384x288_rle_20220624.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.yml new file mode 100644 index 0000000..dd910e6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.yml @@ -0,0 +1,90 @@ +Collections: +- Name: RLE + Paper: + Title: Human pose regression with residual log-likelihood estimation + URL: https://arxiv.org/abs/2107.11291 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/rle.md +Models: +- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py + In Collection: RLE + Metadata: + Architecture: &id001 + - DeepPose + - RLE + - ResNet + Training Data: COCO + Name: td-reg_res50_rle-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.706 + AP@0.5: 0.888 + AP@0.75: 0.776 + AR: 0.753 + AR@0.5: 0.924 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192-d37efd64_20220913.pth +- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py + In Collection: RLE + Metadata: + Architecture: *id001 + Training Data: 
COCO + Name: td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.719 + AP@0.5: 0.891 + AP@0.75: 0.788 + AR: 0.764 + AR@0.5: 0.925 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192-2cb494ee_20220913.pth +- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py + In Collection: RLE + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-reg_res101_rle-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.722 + AP@0.5: 0.894 + AP@0.75: 0.794 + AR: 0.768 + AR@0.5: 0.93 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192_rle-16c3d461_20220615.pth +- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py + In Collection: RLE + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-reg_res152_rle-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.731 + AP@0.5: 0.897 + AP@0.75: 0.805 + AR: 0.777 + AR@0.5: 0.933 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192_rle-c05bdccf_20220615.pth +- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py + In Collection: RLE + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-reg_res152_rle-8xb64-210e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.749 + AP@0.5: 0.901 + AP@0.75: 0.815 + AR: 0.793 + AR@0.5: 0.935 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_384x288_rle-b77c4c37_20220624.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py 
b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..c1a2232 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py @@ -0,0 +1,126 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(192, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MobileNetV2', + widen_factor=1., + out_indices=(7, ), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/top_down/' + 'mobilenetv2/mobilenetv2_coco_256x192-d1e58e7b_20200727.pth')), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RLEHead', + in_channels=1280, + num_joints=17, + loss=dict(type='RLELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + ), +) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + 
dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json', + score_mode='bbox_rle') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..e55f676 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime 
+train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(192, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=17, + loss=dict(type='SmoothL1Loss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + 
type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..b18ea03 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings 
+codec = dict(type='RegressionLabel', input_size=(192, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RLEHead', + in_channels=2048, + num_joints=17, + loss=dict(type='RLELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + 
bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json', + score_mode='bbox_rle') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..64c621f --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(192, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + 
num_joints=17, + loss=dict(type='SmoothL1Loss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..fa35cfa --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(192, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RLEHead', + in_channels=2048, + num_joints=17, + loss=dict(type='RLELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json', + score_mode='bbox_rle') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py new file mode 100644 index 0000000..06fa1c7 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) 
+ +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(288, 384)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RLEHead', + in_channels=2048, + num_joints=17, + loss=dict(type='RLELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json', + score_mode='bbox_rle') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..09016f6 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(192, 256)) + 
+# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=17, + loss=dict(type='SmoothL1Loss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 
'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..ceccb7a --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(192, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RLEHead', + in_channels=2048, + num_joints=17, + loss=dict(type='RLELoss', use_target_weight=True), + 
decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json', + score_mode='bbox_rle') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..a1d485c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=train_cfg['max_epochs'], + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(192, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RLEHead', + in_channels=2048, + num_joints=17, + loss=dict(type='RLELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + 
dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +test_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file=f'{data_root}person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=f'{data_root}annotations/person_keypoints_val2017.json', + score_mode='bbox_rle') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.md new file mode 100644 index 0000000..af6df37 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.md @@ -0,0 +1,58 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [deeppose_resnet_50](/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.826 | 0.180 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256-c63cd0b6_20210203.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256_20210203.log.json) | +| [deeppose_resnet_101](/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.841 | 0.200 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_mpii_256x256-87516a90_20210205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_mpii_256x256_20210205.log.json) | +| [deeppose_resnet_152](/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py) | 256x256 | 0.850 | 0.208 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_mpii_256x256-15f5e6f9_20210205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_mpii_256x256_20210205.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.yml new file mode 100644 index 0000000..95484bc --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.yml @@ -0,0 +1,42 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py + In Collection: DeepPose + Metadata: + Architecture: &id001 + - DeepPose + - ResNet + Training Data: MPII + Name: 
td-reg_res50_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.826 + Mean@0.1: 0.18 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256-c63cd0b6_20210203.pth +- Config: configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py + In Collection: DeepPose + Metadata: + Architecture: *id001 + Training Data: MPII + Name: td-reg_res101_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.841 + Mean@0.1: 0.2 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_mpii_256x256-87516a90_20210205.pth +- Config: configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py + In Collection: DeepPose + Metadata: + Architecture: *id001 + Training Data: MPII + Name: td-reg_res152_8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.85 + Mean@0.1: 0.208 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_mpii_256x256-15f5e6f9_20210205.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.md new file mode 100644 index 0000000..9e88cfa --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.md @@ -0,0 +1,73 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+RLE (ICCV'2021) + +```bibtex +@inproceedings{li2021human, + title={Human pose regression with residual log-likelihood estimation}, + author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={11025--11034}, + year={2021} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [deeppose_resnet_50_rle](/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py) | 256x256 | 0.861 | 0.277 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256_rle-5f92a619_20220504.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256_rle_20220504.log.json) | diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.yml new file mode 100644 index 0000000..c2d3237 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py + In Collection: RLE + Metadata: + Architecture: + - DeepPose + - RLE + - ResNet + Training Data: MPII + Name: td-reg_res50_rle-8xb64-210e_mpii-256x256 + Results: + - Dataset: MPII + Metrics: + Mean: 0.861 + Mean@0.1: 0.277 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256_rle-5f92a619_20220504.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..1576670 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py @@ -0,0 +1,116 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# 
runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(256, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=16, + loss=dict(type='SmoothL1Loss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file=f'{data_root}/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..f814b67 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py @@ -0,0 +1,118 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(256, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 
116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=16, + loss=dict(type='SmoothL1Loss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +file_client_args = dict(backend='disk') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file=f'{data_root}/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = 
val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..a2ab46c --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py @@ -0,0 +1,116 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(256, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=16, + loss=dict(type='SmoothL1Loss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file=f'{data_root}/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000..922cee2 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py @@ -0,0 +1,116 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# 
optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(256, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RLEHead', + in_channels=2048, + num_joints=16, + loss=dict(type='RLELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform', shift_prob=0), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + 
data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file=f'{data_root}/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/README.md b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/README.md new file mode 100644 index 0000000..cce6151 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/README.md @@ -0,0 +1,22 @@ +# YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss + + + +
+YOLO-Pose (CVPRW'2022) + +```bibtex +@inproceedings{maji2022yolo, + title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss}, + author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2637--2646}, + year={2022} +} +``` + +
+ +YOLO-Pose is a bottom-up pose estimation approach that simultaneously detects all person instances and regresses keypoint locations in a single pass. + +We implement **YOLOX-Pose** based on the **YOLOX** object detection framework and inherits the benefits of unified pose estimation and object detection from YOLO-pose. To predict keypoint locations more accurately, separate branches with adaptive convolutions are used to regress the offsets for different joints. This allows optimizing the feature extraction for each keypoint. diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md new file mode 100644 index 0000000..4d1b9e7 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md @@ -0,0 +1,59 @@ + + +
+YOLO-Pose (CVPRW'2022) + +```bibtex +@inproceedings{maji2022yolo, + title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss}, + author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2637--2646}, + year={2022} +} +``` + +
+ + + +
+YOLOX + +```bibtex +@article{ge2021yolox, + title={Yolox: Exceeding yolo series in 2021}, + author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian}, + journal={arXiv preprint arXiv:2107.08430}, + year={2021} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [yoloxpose_tiny](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py) | 416x416 | 0.526 | 0.793 | 0.556 | 0.571 | 0.833 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-76eb44ca_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-20230829.json) | +| [yoloxpose_s](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py) | 640x640 | 0.641 | 0.872 | 0.702 | 0.682 | 0.902 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-56c79c1f_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-20230829.json) | +| [yoloxpose_m](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py) | 640x640 | 0.695 | 0.899 | 0.766 | 0.733 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-84e9a538_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-20230829.json) | +| [yoloxpose_l](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py) | 640x640 | 0.712 | 0.901 | 0.782 | 0.749 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-de0f8dee_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-20230829.json) | diff --git 
a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml new file mode 100644 index 0000000..400807e --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml @@ -0,0 +1,72 @@ +Collections: +- Name: YOLOXPose + Paper: + Title: 'YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss' + URL: https://arxiv.org/abs/2204.06806 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/yolopose.md +Models: +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py + In Collection: YOLOXPose + Metadata: + Architecture: &id001 + - YOLOXPose + Training Data: COCO + Name: yoloxpose_tiny_4xb64-300e_coco-416 + Results: + - Dataset: COCO + Metrics: + AP: 0.526 + AP@0.5: 0.793 + AP@0.75: 0.556 + AR: 0.571 + AR@0.5: 0.833 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-76eb44ca_20230829.pth +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py + In Collection: YOLOXPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: yoloxpose_s_8xb32-300e_coco-640 + Results: + - Dataset: COCO + Metrics: + AP: 0.641 + AP@0.5: 0.872 + AP@0.75: 0.702 + AR: 0.682 + AR@0.5: 0.902 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-56c79c1f_20230829.pth +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py + In Collection: YOLOXPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: yoloxpose_m_8xb32-300e_coco-640 + Results: + - Dataset: COCO + Metrics: + AP: 0.695 + AP@0.5: 0.899 + AP@0.75: 0.766 + AR: 0.733 + AR@0.5: 0.926 + Task: Body 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-84e9a538_20230829.pth +- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py + In Collection: YOLOXPose + Metadata: + Architecture: *id001 + Training Data: COCO + Name: yoloxpose_l_8xb32-300e_coco-640 + Results: + - Dataset: COCO + Metrics: + AP: 0.712 + AP@0.5: 0.901 + AP@0.75: 0.782 + AR: 0.749 + AR@0.5: 0.926 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-de0f8dee_20230829.pth diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py new file mode 100644 index 0000000..a339ed9 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py @@ -0,0 +1,17 @@ +_base_ = './yoloxpose_s_8xb32-300e_coco-640.py' + +widen_factor = 1 +deepen_factor = 1 +checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_' \ + 'l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth' + +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint), + ), + neck=dict( + in_channels=[256, 512, 1024], out_channels=256, num_csp_blocks=3), + head=dict(head_module_cfg=dict(widen_factor=widen_factor))) diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py new file mode 100644 index 0000000..613cfc5 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py @@ -0,0 +1,16 @@ +_base_ = './yoloxpose_s_8xb32-300e_coco-640.py' + +widen_factor = 0.75 +deepen_factor = 0.67 +checkpoint = 
'https://download.openmmlab.com/mmpose/v1/pretrained_models/' \ + 'yolox_m_8x8_300e_coco_20230829.pth' + +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint), + ), + neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + head=dict(head_module_cfg=dict(widen_factor=widen_factor))) diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py new file mode 100644 index 0000000..3c935ae --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py @@ -0,0 +1,266 @@ +_base_ = '../../../_base_/default_runtime.py' + +# runtime +train_cfg = dict( + _delete_=True, + type='EpochBasedTrainLoop', + max_epochs=300, + val_interval=10, + dynamic_intervals=[(280, 1)]) + +auto_scale_lr = dict(base_batch_size=256) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3)) + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, + bias_decay_mult=0, + bypass_duplicate=True, + ), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0002, + begin=5, + T_max=280, + end=280, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=280, end=300), +] + +# model +widen_factor = 0.5 +deepen_factor = 0.33 + +model = dict( + type='BottomupPoseEstimator', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + data_preprocessor=dict( + 
type='PoseDataPreprocessor', + pad_size_divisor=32, + mean=[0, 0, 0], + std=[1, 1, 1], + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1), + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_' + '20211121_095711-4592a793.pth', + prefix='backbone.', + )), + neck=dict( + type='YOLOXPAFPN', + in_channels=[128, 256, 512], + out_channels=128, + num_csp_blocks=1, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + head=dict( + type='YOLOXPoseHead', + num_keypoints=17, + featmap_strides=(8, 16, 32), + head_module_cfg=dict( + num_classes=1, + in_channels=256, + feat_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + prior_generator=dict( + type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]), + assigner=dict(type='SimOTAAssigner', dynamic_k_indicator='oks'), + overlaps_power=0.5, + loss_cls=dict(type='BCELoss', reduction='sum', loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj=dict( + type='BCELoss', + use_target_weight=True, + reduction='sum', + loss_weight=1.0), + loss_oks=dict( + type='OKSLoss', + reduction='none', + metainfo='configs/_base_/datasets/coco.py', + norm_target_weight=True, + loss_weight=30.0), + loss_vis=dict( + type='BCELoss', + use_target_weight=True, + reduction='mean', + loss_weight=1.0), + loss_bbox_aux=dict(type='L1Loss', reduction='sum', 
loss_weight=1.0), + ), + test_cfg=dict( + score_thr=0.01, + nms_thr=0.65, + )) + +# data +input_size = (640, 640) +codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size) + +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict( + type='YOLOXMixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImage'), + dict( + type='BottomupRandomAffine', + input_size=(640, 640), + shift_prob=0, + rotate_prob=0, + scale_prob=0, + scale_type='long', + pad_val=(114, 114, 114), + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] + +data_mode = 'bottomup' +data_root = 'data/' + +dataset_coco = dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + filter_cfg=dict(filter_empty_gt=False, min_size=32), + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='coco/train2017/'), + pipeline=train_pipeline_stage1, +) + +train_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', 
shuffle=True), + dataset=dataset_coco) + +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json', + score_mode='bbox', + nms_mode='none', +) +test_evaluator = val_evaluator + +custom_hooks = [ + dict( + type='YOLOXPoseModeSwitchHook', + num_last_epochs=20, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), +] diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py new file mode 100644 index 0000000..68a5ad7 --- /dev/null +++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py @@ -0,0 +1,77 @@ +_base_ = './yoloxpose_s_8xb32-300e_coco-640.py' + +# model settings +widen_factor = 0.375 +deepen_factor = 0.33 +checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_' \ + 'tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth' + +model = dict( + 
data_preprocessor=dict(batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=1), + ]), + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint), + ), + neck=dict( + in_channels=[96, 192, 384], + out_channels=96, + ), + head=dict(head_module_cfg=dict(widen_factor=widen_factor), )) + +# dataset settings +train_pipeline_stage1 = [ + dict(type='LoadImage', backend_args=None), + dict( + type='Mosaic', + img_scale=_base_.input_size, + pad_val=114.0, + pre_transform=[dict(type='LoadImage', backend_args=None)]), + dict( + type='BottomupRandomAffine', + input_size=_base_.input_size, + shift_factor=0.1, + rotate_factor=10, + scale_factor=(0.75, 1.0), + pad_val=114, + distribution='uniform', + transform_mode='perspective', + bbox_keep_corner=False, + clip_border=True, + ), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip'), + dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False), + dict(type='GenerateTarget', encoder=_base_.codec), + dict( + type='PackPoseInputs', + extra_mapping_labels={ + 'bbox': 'bboxes', + 'bbox_labels': 'labels', + 'keypoints': 'keypoints', + 'keypoints_visible': 'keypoints_visible', + 'area': 'areas' + }), +] +train_dataloader = dict( + batch_size=64, dataset=dict(pipeline=train_pipeline_stage1)) + +input_size = (416, 416) +val_pipeline = [ + dict(type='LoadImage'), + dict( + type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale')) +] + +val_dataloader = dict(dataset=dict(pipeline=val_pipeline, )) +test_dataloader = val_dataloader diff --git a/modules/rtmpose/configs/body_3d_keypoint/README.md b/modules/rtmpose/configs/body_3d_keypoint/README.md new file mode 100644 index 0000000..0010706 --- /dev/null +++ 
b/modules/rtmpose/configs/body_3d_keypoint/README.md @@ -0,0 +1,13 @@ +# Human Body 3D Pose Estimation + +3D pose estimation is the detection and analysis of X, Y, Z coordinates of human body joints from RGB images. For single-person 3D pose estimation from a monocular camera, existing works can be classified into three categories: (1) from 2D poses to 3D poses (2D-to-3D pose lifting) (2) jointly learning 2D and 3D poses, and (3) directly regressing 3D poses from images. + +## Data preparation + +Please follow [DATA Preparation](/docs/en/dataset_zoo/3d_body_keypoint.md) to prepare data. + +## Demo + +Please follow [Demo](/demo/docs/en/3d_human_pose_demo.md) to run demos. + +
diff --git a/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/README.md b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/README.md new file mode 100644 index 0000000..78507e0 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/README.md @@ -0,0 +1,13 @@ +# A simple yet effective baseline for 3d human pose estimation + +Simple 3D baseline proposes to break down the task of 3d human pose estimation into 2 stages: (1) Image → 2D pose (2) 2D pose → 3D pose. + +The authors find that "lifting" ground truth 2D joint locations to 3D space is a task that can be solved with a low error rate. Based on the success of 2d human pose estimation, it directly "lifts" 2d joint locations to 3d space. + +## Results and Models + +### Human3.6m Dataset + +| Arch | MPJPE | P-MPJPE | ckpt | log | Details and Download | +| :------------------------------------------ | :---: | :-----: | :-----------------------------------------: | :-----------------------------------------: | :---------------------------------------------------------: | +| [SimpleBaseline3D](/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py) | 43.4 | 34.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) | [simplebaseline3d_h36m.md](./h36m/simplebaseline3d_h36m.md) | diff --git a/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py new file mode 100644 index 0000000..15af0f5 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py @@ -0,0 +1,168 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + 
type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=200, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) + +# learning policy +param_scheduler = [ + dict(type='StepLR', step_size=100000, gamma=0.96, end=80, by_epoch=False) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1)) + +# codec settings +# 3D keypoint normalization parameters +# From file: '{data_root}/annotation_body3d/fps50/joint3d_rel_stats.pkl' +target_mean = [[-2.55652589e-04, -7.11960570e-03, -9.81433052e-04], + [-5.65463051e-03, 3.19636009e-01, 7.19329269e-02], + [-1.01705840e-02, 6.91147892e-01, 1.55352986e-01], + [2.55651315e-04, 7.11954606e-03, 9.81423866e-04], + [-5.09729780e-03, 3.27040413e-01, 7.22258095e-02], + [-9.99656606e-03, 7.08277383e-01, 1.58016408e-01], + [2.90583676e-03, -2.11363307e-01, -4.74210915e-02], + [5.67537804e-03, -4.35088906e-01, -9.76974016e-02], + [5.93884964e-03, -4.91891970e-01, -1.10666618e-01], + [7.37352083e-03, -5.83948619e-01, -1.31171400e-01], + [5.41920653e-03, -3.83931702e-01, -8.68145417e-02], + [2.95964662e-03, -1.87567488e-01, -4.34536934e-02], + [1.26585822e-03, -1.20170579e-01, -2.82526049e-02], + [4.67186639e-03, -3.83644089e-01, -8.55125784e-02], + [1.67648571e-03, -1.97007177e-01, -4.31368364e-02], + [8.70569015e-04, -1.68664569e-01, -3.73902498e-02]] +target_std = [[0.11072244, 0.02238818, 0.07246294], + [0.15856311, 0.18933832, 0.20880479], + [0.19179935, 0.24320062, 0.24756193], + [0.11072181, 0.02238805, 0.07246253], + [0.15880454, 0.19977188, 0.2147063], + [0.18001944, 0.25052739, 0.24853247], + [0.05210694, 0.05211406, 0.06908241], + [0.09515367, 0.10133032, 0.12899733], + [0.11742458, 0.12648469, 0.16465091], + [0.12360297, 0.13085539, 0.16433336], + [0.14602232, 0.09707956, 0.13952731], + [0.24347532, 0.12982249, 0.20230181], + [0.2446877, 0.21501816, 0.23938235], + [0.13876084, 0.1008926, 0.1424411], + [0.23687529, 0.14491219, 0.20980829], + [0.24400695, 0.23975028, 0.25520584]] +# 2D keypoint normalization parameters +# From file: '{data_root}/annotation_body3d/fps50/joint2d_stats.pkl' +keypoints_mean = [[532.08351635, 419.74137558], [531.80953144, 418.2607141], + [530.68456967, 493.54259285], [529.36968722, 575.96448516], + [532.29767646, 421.28483336], [531.93946631, 494.72186795], + [529.71984447, 578.96110365], [532.93699382, 370.65225054], + [534.1101856, 317.90342311], [534.55416813, 304.24143901], + [534.86955004, 282.31030885], [534.11308566, 330.11296796], + [533.53637525, 376.2742511], [533.49380107, 391.72324565], + [533.52579142, 330.09494668], [532.50804964, 374.190479], + [532.72786934, 380.61615716]] +keypoints_std = [[107.73640054, 63.35908715], [119.00836213, 64.1215443], + [119.12412107, 50.53806215], [120.61688045, 56.38444891], + [101.95735275, 62.89636486], [106.24832897, 48.41178119], + [108.46734966, 54.58177071], [109.07369806, 68.70443672], + [111.20130351, 74.87287863], [111.63203838, 77.80542514], + [113.22330788, 79.90670556], [105.7145833, 73.27049436], + [107.05804267, 73.93175781], [107.97449418, 83.30391802], + [121.60675105, 74.25691526], [134.34378973, 77.48125087], + [131.79990652, 89.86721124]] +codec = dict( + type='ImagePoseLifting', + num_keypoints=17, + root_index=0, + remove_root=True, + target_mean=target_mean, + target_std=target_std, + keypoints_mean=keypoints_mean, + keypoints_std=keypoints_std) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(1, 1, 1), + dropout=0.5, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=16, + loss=dict(type='MSELoss'), + decoder=codec, + )) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# 
pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root', 'target_root_index', 'target_mean', + 'target_std')) +] +val_pipeline = train_pipeline + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=1, + causal=True, + keypoint_2d_src='gt', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=1, + causal=True, + keypoint_2d_src='gt', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.md b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.md new file mode 100644 index 0000000..e710a15 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.md @@ -0,0 +1,44 @@ + + +
+SimpleBaseline3D (ICCV'2017) + +```bibtex +@inproceedings{martinez_2017_3dbaseline, + title={A simple yet effective baseline for 3d human pose estimation}, + author={Martinez, Julieta and Hossain, Rayat and Romero, Javier and Little, James J.}, + booktitle={ICCV}, + year={2017} +} +``` + +
+ + + +
+Human3.6M (TPAMI'2014) + +```bibtex +@article{h36m_pami, + author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian}, + title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher = {IEEE Computer Society}, + volume = {36}, + number = {7}, + pages = {1325-1339}, + month = {jul}, + year = {2014} +} +``` + +
+ +Results on Human3.6M dataset with ground truth 2D detections + +| Arch | MPJPE | P-MPJPE | ckpt | log | +| :-------------------------------------------------------------- | :---: | :-----: | :-------------------------------------------------------------: | :------------------------------------------------------------: | +| [SimpleBaseline3D1](/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py) | 43.4 | 34.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) | + +1 Differing from the original paper, we didn't apply the `max-norm constraint` because we found this led to a better convergence and performance. diff --git a/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.yml b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.yml new file mode 100644 index 0000000..c06b349 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.yml @@ -0,0 +1,21 @@ +Collections: +- Name: SimpleBaseline3D + Paper: + Title: A simple yet effective baseline for 3d human pose estimation + URL: http://openaccess.thecvf.com/content_iccv_2017/html/Martinez_A_Simple_yet_ICCV_2017_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/simplebaseline3d.md +Models: +- Config: configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py + In Collection: SimpleBaseline3D + Metadata: + Architecture: &id001 + - SimpleBaseline3D + Training Data: Human3.6M + Name: image-pose-lift_tcn_8xb64-200e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 43.4 + P-MPJPE: 34.3 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth diff --git 
a/modules/rtmpose/configs/body_3d_keypoint/motionbert/README.md b/modules/rtmpose/configs/body_3d_keypoint/motionbert/README.md new file mode 100644 index 0000000..d6a96e3 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/README.md @@ -0,0 +1,23 @@ +# MotionBERT: A Unified Perspective on Learning Human Motion Representations + +Motionbert proposes a pretraining stage in which a motion encoder is trained to recover the underlying 3D motion from noisy partial 2D observations. The motion representations acquired in this way incorporate geometric, kinematic, and physical knowledge about human motion, which can be easily transferred to multiple downstream tasks. + +## Results and Models + +### Human3.6m Dataset + +| Arch | MPJPE | P-MPJPE | ckpt | log | Details and Download | +| :-------------------------------------------------------------------- | :---: | :-----: | :-------------------------------------------------------------------: | :-: | :---------------------------------------------: | +| [MotionBERT\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py) | 35.3 | 27.7 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py) | 27.5 | 21.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | + +### Human3.6m Dataset from official repo 1 + +| Arch | MPJPE | Average MPJPE | P-MPJPE | ckpt | log | Details and Download | +| :------------------------------------------------------------- | :---: | :-----------: | :-----: | :-------------------------------------------------------------: | :-: | :---------------------------------------------: | +| 
[MotionBERT\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) | + +1 Please refer to the [doc](./h36m/motionbert_h36m.md) for more details. + +*Models with * are converted from the official repo. The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py new file mode 100644 index 0000000..30ab90d --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py @@ -0,0 +1,137 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=240, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=120, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + 
save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=243, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train_original.npz', + seq_len=1, + multiple_target=243, + multiple_target_step=81, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + 
pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test_original.npz', + factor_file='annotation_body3d/fps50/h36m_factors.npy', + seq_len=1, + seq_step=1, + multiple_target=243, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +skip_list = [ + 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1' +] +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe', skip_list=skip_list), + dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list) +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py new file mode 100644 index 0000000..6909ec2 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py @@ -0,0 +1,136 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=240, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=120, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', 
num_keypoints=17, concat_vis=True, mode='train') +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=243, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=1, + multiple_target=243, + multiple_target_step=81, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=1, + seq_step=1, + multiple_target=243, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +skip_list = [ + 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1' +] +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe', skip_list=skip_list), + dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list) +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py new file mode 100644 index 0000000..5db0db7 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py @@ -0,0 +1,142 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=120, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = dict( + type='PoseLifter', + 
backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=243, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), + test_cfg=dict(flip_test=True), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/' + 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'), +) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train_original.npz', + seq_len=1, + multiple_target=243, + multiple_target_step=81, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
ann_file='annotation_body3d/fps50/h36m_test_original.npz', + factor_file='annotation_body3d/fps50/h36m_factors.npy', + seq_len=1, + seq_step=1, + multiple_target=243, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +skip_list = [ + 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1' +] +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe', skip_list=skip_list), + dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list) +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py new file mode 100644 index 0000000..47f050b --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py @@ -0,0 +1,141 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=120, val_interval=10) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +train_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train') +val_codec = dict( + type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True) + +# model settings +model = 
dict( + type='PoseLifter', + backbone=dict( + type='DSTFormer', + in_channels=3, + feat_size=512, + depth=5, + num_heads=8, + mlp_ratio=2, + seq_len=243, + att_fuse=True, + ), + head=dict( + type='MotionRegressionHead', + in_channels=512, + out_channels=3, + embedding_size=512, + loss=dict(type='MPJPEVelocityJointLoss'), + decoder=val_codec, + ), + test_cfg=dict(flip_test=True), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/' + 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'), +) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict(type='GenerateTarget', encoder=train_codec), + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(center_mode='static', center_x=0.), + target_flip_cfg=dict(center_mode='static', center_x=0.), + flip_label=True), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=val_codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'factor', 'camera_param')) +] + +# data loaders +train_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=1, + multiple_target=243, + multiple_target_step=81, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + batch_size=32, + prefetch_factor=4, + pin_memory=True, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=1, + seq_step=1, + multiple_target=243, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +skip_list = [ + 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1' +] +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe', skip_list=skip_list), + dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list) +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.md b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.md new file mode 100644 index 0000000..0d3ee29 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.md @@ -0,0 +1,55 @@ + + +
+MotionBERT (2022) + +```bibtex + @misc{Zhu_Ma_Liu_Liu_Wu_Wang_2022, + title={Learning Human Motion Representations: A Unified Perspective}, + author={Zhu, Wentao and Ma, Xiaoxuan and Liu, Zhaoyang and Liu, Libin and Wu, Wayne and Wang, Yizhou}, + year={2022}, + month={Oct}, + language={en-US} + } +``` + +
+ + + +
+Human3.6M (TPAMI'2014) + +```bibtex +@article{h36m_pami, +author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian}, +title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments}, +journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, +publisher = {IEEE Computer Society}, +volume = {36}, +number = {7}, +pages = {1325-1339}, +month = {jul}, +year = {2014} +} +``` + +
+ +Results on Human3.6M dataset with ground truth 2D detections + +| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | +| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: | +| [MotionBERT\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py) | 34.5 | 34.6 | 27.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py) | 26.9 | 26.8 | 21.0 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | + +Results on Human3.6M dataset converted from the [official repo](https://github.com/Walter0807/MotionBERT)1 with ground truth 2D detections + +| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt | log | +| :------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :------------------------------------------------------------------------------------: | :-: | +| [MotionBERT\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | +| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | + +1 By default, we test models with [Human 3.6m dataset](/docs/en/dataset_zoo/3d_body_keypoint.md#human3-6m) processed by MMPose. 
The official repo's dataset includes more data and applies a different pre-processing technique. To achieve the same result with the official repo, please download the [test annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_test_original.npz), [train annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_train_original.npz) and [factors](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_factors.npy) under `$MMPOSE/data/h36m/annotation_body3d/fps50` and test with the configs we provided. + +*Models with * are converted from the [official repo](https://github.com/Walter0807/MotionBERT). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.yml b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.yml new file mode 100644 index 0000000..93c4eda --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.yml @@ -0,0 +1,45 @@ +Collections: +- Name: MotionBERT + Paper: + Title: "Learning Human Motion Representations: A Unified Perspective" + URL: https://arxiv.org/abs/2210.06551 + README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/motionbert.md +Models: +- Config: configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py + In Collection: MotionBERT + Metadata: + Architecture: &id001 + - MotionBERT + Training Data: Human3.6M (MotionBERT) + Name: motionbert_dstformer-243frm_8xb32-240e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 34.5 + P-MPJPE: 27.1 + Task: Body 3D Keypoint + - Dataset: Human3.6M (MotionBERT) + Metrics: + MPJPE: 39.8 + P-MPJPE: 33.4 + Task: Body 3D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth +- Config: configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py + In Collection: MotionBERT + Alias: human3d + Metadata: + Architecture: *id001 + Training Data: Human3.6M (MotionBERT) + Name: motionbert_dstformer-ft-243frm_8xb32-120e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 26.9 + P-MPJPE: 21.0 + Task: Body 3D Keypoint + - Dataset: Human3.6M (MotionBERT) + Metrics: + MPJPE: 37.7 + P-MPJPE: 32.2 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/README.md b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/README.md new file mode 100644 index 0000000..de32a06 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/README.md @@ -0,0 +1,17 @@ +# 3D human pose estimation in video with temporal convolutions and semi-supervised training + +Based on the success of 2d human pose estimation, it directly "lifts" a sequence of 2d keypoints to 3d keypoints. 
+ +## Results and Models + +### Human3.6m Dataset + +| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | Details and Download | +| :-------------------------------------------- | :---: | :-----: | :-----: | :-------------------------------------------: | :------------------------------------------: | :---------------------------------------------: | +| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py) | 40.1 | 30.1 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py) | 39.1 | 29.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py) | 37.6 | 28.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py) | 53.0 | 41.3 | / | 
[ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 47.9 | 38.0 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py) | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | +| [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | [videpose3d_h36m.md](./h36m/videpose3d_h36m.md) | diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py 
b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py new file mode 100644 index 0000000..3e164fd --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py @@ -0,0 +1,132 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=160, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-4)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.98, end=80, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=1024) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +codec = dict( + type='VideoPoseLifting', + num_keypoints=17, + zero_center=True, + root_index=0, + remove_root=False) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=4, + kernel_sizes=(1, 1, 1, 1, 1), + dropout=0.25, + use_stride_conv=True, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=17, + loss=dict(type='MPJPELoss'), + decoder=codec, + )) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(), + target_flip_cfg=dict(), + ), + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + 
meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] + +# data loaders +train_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=1, + causal=False, + pad_video_seq=False, + keypoint_2d_src='detection', + keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_train.npy', + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + ), +) +val_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=1, + causal=False, + pad_video_seq=False, + keypoint_2d_src='detection', + keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_test.npy', + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py new file mode 100644 index 0000000..592eac9 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py @@ -0,0 +1,132 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + 
type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=200, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-4)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.98, end=200, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=1024) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +codec = dict( + type='VideoPoseLifting', + num_keypoints=17, + zero_center=True, + root_index=0, + remove_root=False) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=4, + kernel_sizes=(3, 3, 3, 3, 3), + dropout=0.25, + use_stride_conv=True, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=17, + loss=dict(type='MPJPELoss'), + decoder=codec, + )) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(), + target_flip_cfg=dict(), + ), + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] + +# data loaders +train_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=243, + causal=False, + pad_video_seq=True, + keypoint_2d_src='detection', + 
keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_train.npy', + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + ), +) +val_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=243, + causal=False, + pad_video_seq=True, + keypoint_2d_src='detection', + keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_test.npy', + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py new file mode 100644 index 0000000..edcf918 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py @@ -0,0 +1,128 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=160, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.975, end=80, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=1024) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + 
save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +codec = dict( + type='VideoPoseLifting', + num_keypoints=17, + zero_center=True, + root_index=0, + remove_root=False) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=4, + kernel_sizes=(3, 3, 3, 3, 3), + dropout=0.25, + use_stride_conv=True, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=17, + loss=dict(type='MPJPELoss'), + decoder=codec, + )) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(), + target_flip_cfg=dict(), + ), + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] + +# data loaders +train_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=243, + causal=False, + pad_video_seq=True, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + ), +) +val_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=243, + causal=False, + pad_video_seq=True, + camera_param_file='annotation_body3d/cameras.pkl', 
+ data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py new file mode 100644 index 0000000..842bee6 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py @@ -0,0 +1,119 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = None + +# optimizer + +# learning policy + +auto_scale_lr = dict(base_batch_size=1024) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +codec = dict( + type='VideoPoseLifting', + num_keypoints=17, + zero_center=True, + root_index=0, + remove_root=False) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(3, 3, 3), + dropout=0.25, + use_stride_conv=True, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=17, + loss=dict(type='MPJPELoss'), + decoder=codec, + ), + traj_backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(3, 3, 3), + dropout=0.25, + use_stride_conv=True, + ), + traj_head=dict( + type='TrajectoryRegressionHead', + in_channels=1024, + 
num_joints=1, + loss=dict(type='MPJPELoss', use_target_weight=True), + decoder=codec, + ), + semi_loss=dict( + type='SemiSupervisionLoss', + joint_parents=[0, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15], + warmup_iterations=1311376 // 64 // 8 * 5), +) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +val_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] + +# data loaders +val_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=27, + causal=False, + pad_video_seq=True, + keypoint_2d_src='detection', + keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_test.npy', + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe'), + dict(type='MPJPE', mode='n-mpjpe') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py new file mode 100644 index 0000000..7f0e68c --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py @@ -0,0 +1,117 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, 
name='visualizer') + +# runtime +train_cfg = None + +# optimizer + +# learning policy + +auto_scale_lr = dict(base_batch_size=1024) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +codec = dict( + type='VideoPoseLifting', + num_keypoints=17, + zero_center=True, + root_index=0, + remove_root=False) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(3, 3, 3), + dropout=0.25, + use_stride_conv=True, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=17, + loss=dict(type='MPJPELoss'), + decoder=codec, + ), + traj_backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(3, 3, 3), + dropout=0.25, + use_stride_conv=True, + ), + traj_head=dict( + type='TrajectoryRegressionHead', + in_channels=1024, + num_joints=1, + loss=dict(type='MPJPELoss', use_target_weight=True), + decoder=codec, + ), + semi_loss=dict( + type='SemiSupervisionLoss', + joint_parents=[0, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15], + warmup_iterations=1311376 // 64 // 8 * 5), +) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +val_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] + +# data loaders +val_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=27, + causal=False, + pad_video_seq=True, + camera_param_file='annotation_body3d/cameras.pkl', + 
data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe'), + dict(type='MPJPE', mode='n-mpjpe') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py new file mode 100644 index 0000000..9d39fe7 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py @@ -0,0 +1,128 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=160, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.975, end=80, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=1024) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +codec = dict( + type='VideoPoseLifting', + num_keypoints=17, + zero_center=True, + root_index=0, + remove_root=False) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(3, 3, 3), + dropout=0.25, + use_stride_conv=True, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=17, + loss=dict(type='MPJPELoss'), + decoder=codec, + )) + +# base dataset settings +dataset_type = 'Human36mDataset' 
+data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(), + target_flip_cfg=dict(), + ), + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] + +# data loaders +train_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=27, + causal=False, + pad_video_seq=True, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + ), +) +val_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=27, + causal=False, + pad_video_seq=True, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py new file mode 100644 index 0000000..e43326e --- /dev/null +++ 
b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py @@ -0,0 +1,128 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=160, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3)) + +# learning policy +param_scheduler = [ + dict(type='ExponentialLR', gamma=0.975, end=80, by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=1024) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +codec = dict( + type='VideoPoseLifting', + num_keypoints=17, + zero_center=True, + root_index=0, + remove_root=False) + +# model settings +model = dict( + type='PoseLifter', + backbone=dict( + type='TCN', + in_channels=2 * 17, + stem_channels=1024, + num_blocks=3, + kernel_sizes=(3, 3, 3, 3), + dropout=0.25, + use_stride_conv=True, + ), + head=dict( + type='TemporalRegressionHead', + in_channels=1024, + num_joints=17, + loss=dict(type='MPJPELoss'), + decoder=codec, + )) + +# base dataset settings +dataset_type = 'Human36mDataset' +data_root = 'data/h36m/' + +# pipelines +train_pipeline = [ + dict( + type='RandomFlipAroundRoot', + keypoints_flip_cfg=dict(), + target_flip_cfg=dict(), + ), + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] +val_pipeline = [ + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices', + 'target_root')) +] + +# data loaders +train_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_train.npz', + seq_len=81, + causal=False, + pad_video_seq=True, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + ), +) +val_dataloader = dict( + batch_size=128, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotation_body3d/fps50/h36m_test.npz', + seq_len=81, + causal=False, + pad_video_seq=True, + camera_param_file='annotation_body3d/cameras.pkl', + data_root=data_root, + data_prefix=dict(img='images/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='MPJPE', mode='mpjpe'), + dict(type='MPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.md b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.md new file mode 100644 index 0000000..01cef1d --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.md @@ -0,0 +1,67 @@ + + +
+ +VideoPose3D (CVPR'2019) + +```bibtex +@inproceedings{pavllo20193d, +title={3d human pose estimation in video with temporal convolutions and semi-supervised training}, +author={Pavllo, Dario and Feichtenhofer, Christoph and Grangier, David and Auli, Michael}, +booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, +pages={7753--7762}, +year={2019} +} +``` + +
+ + + +
+Human3.6M (TPAMI'2014) + +```bibtex +@article{h36m_pami, +author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian}, +title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments}, +journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, +publisher = {IEEE Computer Society}, +volume = {36}, +number = {7}, +pages = {1325-1339}, +month = {jul}, +year = {2014} +} +``` + +
+ +Testing results on Human3.6M dataset with ground truth 2D detections, supervised training + +| Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log | +| :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: | +| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | +| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py) | 243 | 37.6 | 28.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | + +Testing results on Human3.6M dataset with CPN 2D detections1, supervised training + +| Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log | +| :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: | +| 
[VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | +| [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | 47.9 | 38.0 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | + +Testing results on Human3.6M dataset with ground truth 2D detections, semi-supervised training + +| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | +| :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: | +| 10% S1 | [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | + +Testing results on Human3.6M dataset with CPN 2D detections1, semi-supervised training + +| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | +| :------------ | :-------------------------------------------------: | :-------------: | :---: 
| :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: | +| 10% S1 | [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | + +1 CPN 2D detections are provided by [official repo](https://github.com/facebookresearch/VideoPose3D/blob/master/DATASETS.md). The reformatted version used in this repository can be downloaded from [train_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_train.npy) and [test_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_test.npy). 
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.yml b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.yml new file mode 100644 index 0000000..81e66c5 --- /dev/null +++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.yml @@ -0,0 +1,102 @@ +Collections: +- Name: VideoPose3D + Paper: + Title: 3d human pose estimation in video with temporal convolutions and semi-supervised + training + URL: http://openaccess.thecvf.com/content_CVPR_2019/html/Pavllo_3D_Human_Pose_Estimation_in_Video_With_Temporal_Convolutions_and_CVPR_2019_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/videopose3d.md +Models: +- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py + In Collection: VideoPose3D + Metadata: + Architecture: &id001 + - VideoPose3D + Training Data: Human3.6M + Name: video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 40.0 + P-MPJPE: 30.1 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth +- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py + In Collection: VideoPose3D + Metadata: + Architecture: *id001 + Training Data: Human3.6M + Name: video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 38.9 + P-MPJPE: 29.2 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth +- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py + In Collection: VideoPose3D + Metadata: + Architecture: *id001 + Training Data: Human3.6M + Name: video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m + 
Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 37.6 + P-MPJPE: 28.3 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth +- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py + In Collection: VideoPose3D + Metadata: + Architecture: *id001 + Training Data: Human3.6M + Name: video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 52.9 + P-MPJPE: 41.3 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth +- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py + In Collection: VideoPose3D + Metadata: + Architecture: *id001 + Training Data: Human3.6M + Name: video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 47.9 + P-MPJPE: 38.0 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth +- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py + In Collection: VideoPose3D + Metadata: + Architecture: *id001 + Training Data: Human3.6M + Name: video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 58.1 + N-MPJPE: 54.7 + P-MPJPE: 42.8 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth +- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py + In Collection: VideoPose3D + Metadata: + Architecture: *id001 + Training Data: Human3.6M + Name: 
video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m + Results: + - Dataset: Human3.6M + Metrics: + MPJPE: 67.4 + N-MPJPE: 63.2 + P-MPJPE: 50.1 + Task: Body 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/README.md b/modules/rtmpose/configs/face_2d_keypoint/README.md new file mode 100644 index 0000000..058b9b5 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/README.md @@ -0,0 +1,16 @@ +# 2D Face Landmark Detection + +2D face landmark detection (also referred to as face alignment) is defined as the task of detecting the face keypoints from an input image. + +Normally, the input images are cropped face images, where the face locates at the center; +or the rough location (or the bounding box) of the hand is provided. + +## Data preparation + +Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_face_keypoint.md) to prepare data. + +## Demo + +Please follow [Demo](/demo/docs/en/2d_face_demo.md) to run demos. + +
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000..c04d5bc --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/README.md @@ -0,0 +1,32 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. +In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. 
+ +## Results and Models + +### COCO-WholeBody-Face Dataset + +Results on COCO-WholeBody-Face val set + +| Model | Input Size | NME | Details and Download | +| :-------: | :--------: | :----: | :------------------------------------------------------------------------------------: | +| RTMPose-m | 256x256 | 0.0466 | [rtmpose_coco_wholebody_face.md](./coco_wholebody_face/rtmpose_coco_wholebody_face.md) | + +### WFLW Dataset + +Results on WFLW dataset + +| Model | Input Size | NME | Details and Download | +| :-------: | :--------: | :--: | :---------------------------------------: | +| RTMPose-m | 256x256 | 4.01 | [rtmpose_wflw.md](./wflw/rtmpose_wflw.md) | + +### LaPa Dataset + +Results on LaPa dataset + +| Model | Input Size | NME | Details and Download | +| :-------: | :--------: | :--: | :---------------------------------------: | +| RTMPose-m | 256x256 | 1.29 | [rtmpose_lapa.md](./lapa/rtmpose_lapa.md) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py new file mode 100644 index 0000000..07db40c --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py @@ -0,0 +1,231 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 60 +stage2_num_epochs = 10 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + 
begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=68, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyFaceDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline 
= [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + 
pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='NME', rule='less', max_keep_ckpts=1, interval=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.md new file mode 100644 index 0000000..fb09265 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.md @@ -0,0 +1,39 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO-WholeBody-Face (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Face val set + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [pose_rtmpose_m](/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0466 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-face_pt-aic-coco_60e-256x256-62026ef2_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-face_pt-aic-coco_60e-256x256-62026ef2_20230228.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.yml new file mode 100644 index 0000000..00b0906 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.yml @@ -0,0 +1,14 @@ +Models: +- Config: configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py + In Collection: RTMPose + Metadata: + Architecture: + - RTMPose + Training Data: COCO-WholeBody-Face + Name: rtmpose-m_8xb32-60e_coco-wholebody-face-256x256 + Results: + - Dataset: COCO-WholeBody-Face + Metrics: + NME: 0.0466 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-face_pt-aic-coco_60e-256x256-62026ef2_20230228.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-m_8xb256-120e_face6-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-m_8xb256-120e_face6-256x256.py new file mode 100644 index 0000000..22d28dd --- /dev/null +++ 
b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-m_8xb256-120e_face6-256x256.py @@ -0,0 +1,690 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# lapa coco wflw 300w cofw halpe + +# runtime +max_epochs = 120 +stage2_num_epochs = 10 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.005, + begin=30, + end=max_epochs, + T_max=max_epochs - 30, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=106, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s 
in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'LapaDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.2), + dict(type='MedianBlur', p=0.2), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', 
p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# train dataset +dataset_lapa = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[], +) + +kpt_68_to_106 = [ + # + (0, 0), + (1, 2), + (2, 4), + (3, 6), + (4, 8), + (5, 10), + (6, 12), + (7, 14), + (8, 16), + (9, 18), + (10, 20), + (11, 22), + (12, 24), + (13, 26), + (14, 28), + (15, 30), + (16, 32), + # + (17, 33), + (18, 34), + (19, 35), + (20, 36), + (21, 37), + # + (22, 42), + (23, 43), + (24, 44), + (25, 45), + (26, 46), + # + (27, 51), + (28, 52), + (29, 53), + (30, 54), + # + (31, 58), + (32, 59), + (33, 60), + (34, 61), + (35, 62), + # + (36, 66), + (39, 70), + # + ((37, 38), 68), + ((40, 41), 72), + # + (42, 75), + (45, 79), + # + ((43, 44), 77), + ((46, 47), 81), + # + (48, 84), + (49, 85), + (50, 86), + (51, 87), + (52, 88), + (53, 89), + (54, 90), + (55, 91), + (56, 92), + (57, 93), + (58, 94), + (59, 95), + (60, 96), + (61, 97), + (62, 98), + (63, 99), + (64, 100), + (65, 101), + (66, 102), + (67, 103) +] + +mapping_halpe = [ + # + (26, 0), + (27, 2), + (28, 4), + (29, 6), + (30, 8), + (31, 10), + (32, 12), + (33, 14), + (34, 16), + (35, 18), + (36, 20), + (37, 22), + (38, 24), + (39, 26), + (40, 28), + (41, 30), + (42, 32), + # + (43, 33), + (44, 34), + (45, 35), + (46, 36), + (47, 37), + # + (48, 42), + (49, 43), + (50, 44), + (51, 45), + (52, 46), + # + (53, 51), + (54, 52), + (55, 53), + (56, 54), + # + (57, 58), + (58, 59), + (59, 60), + (60, 61), + (61, 62), + # + (62, 66), + (65, 70), + # + ((63, 64), 68), + ((66, 67), 72), + # + (68, 75), + (71, 79), + # + ((69, 70), 77), + ((72, 73), 81), + # + (74, 84), + (75, 
85), + (76, 86), + (77, 87), + (78, 88), + (79, 89), + (80, 90), + (81, 91), + (82, 92), + (83, 93), + (84, 94), + (85, 95), + (86, 96), + (87, 97), + (88, 98), + (89, 99), + (90, 100), + (91, 101), + (92, 102), + (93, 103) +] + +mapping_wflw = [ + # + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), + (17, 17), + (18, 18), + (19, 19), + (20, 20), + (21, 21), + (22, 22), + (23, 23), + (24, 24), + (25, 25), + (26, 26), + (27, 27), + (28, 28), + (29, 29), + (30, 30), + (31, 31), + (32, 32), + # + (33, 33), + (34, 34), + (35, 35), + (36, 36), + (37, 37), + (38, 38), + (39, 39), + (40, 40), + (41, 41), + # + (42, 42), + (43, 43), + (44, 44), + (45, 45), + (46, 46), + (47, 47), + (48, 48), + (49, 49), + (50, 50), + # + (51, 51), + (52, 52), + (53, 53), + (54, 54), + # + (55, 58), + (56, 59), + (57, 60), + (58, 61), + (59, 62), + # + (60, 66), + (61, 67), + (62, 68), + (63, 69), + (64, 70), + (65, 71), + (66, 72), + (67, 73), + # + (68, 75), + (69, 76), + (70, 77), + (71, 78), + (72, 79), + (73, 80), + (74, 81), + (75, 82), + # + (76, 84), + (77, 85), + (78, 86), + (79, 87), + (80, 88), + (81, 89), + (82, 90), + (83, 91), + (84, 92), + (85, 93), + (86, 94), + (87, 95), + (88, 96), + (89, 97), + (90, 98), + (91, 99), + (92, 100), + (93, 101), + (94, 102), + (95, 103), + # + (96, 104), + # + (97, 105) +] + +mapping_cofw = [ + # + (0, 33), + (2, 38), + (4, 35), + (5, 40), + # + (1, 46), + (3, 50), + (6, 44), + (7, 48), + # + (8, 60), + (10, 64), + (12, 62), + (13, 66), + # + (9, 72), + (11, 68), + (14, 70), + (15, 74), + # + (18, 57), + (19, 63), + (20, 54), + (21, 60), + # + (22, 84), + (23, 90), + (24, 87), + (25, 98), + (26, 102), + (27, 93), + # + (28, 16) +] +dataset_coco = dict( + type='CocoWholeBodyFaceDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + 
data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw) + ], +) + +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_133kpt.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/lapa.py'), + datasets=[ + dataset_lapa, dataset_coco, dataset_wflw, dataset_300w, + dataset_cofw, dataset_halpe + ], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_test.json', + data_prefix=dict(img='pose/LaPa/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# test dataset +val_lapa = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_test.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[], +) + +val_coco = dict( + type='CocoWholeBodyFaceDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +val_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw) + ], +) + +val_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_test.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +val_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_test.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe) + ], +) + +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', 
shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/lapa.py'), + datasets=[val_lapa, val_coco, val_wflw, val_300w, val_cofw, val_halpe], + pipeline=val_pipeline, + test_mode=True, + )) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='NME', rule='less', max_keep_ckpts=1, interval=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-s_8xb256-120e_face6-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-s_8xb256-120e_face6-256x256.py new file mode 100644 index 0000000..b18d19d --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-s_8xb256-120e_face6-256x256.py @@ -0,0 +1,691 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# lapa coco wflw 300w cofw halpe + +# runtime +max_epochs = 120 +stage2_num_epochs = 10 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.005, + begin=30, + end=max_epochs, + T_max=max_epochs - 30, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training 
batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e-ea671761.pth') + ), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=106, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'LapaDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ 
+ dict(type='Blur', p=0.2), + dict(type='MedianBlur', p=0.2), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +# train dataset +dataset_lapa = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[], +) + +kpt_68_to_106 = [ + # + (0, 0), + (1, 2), + (2, 4), + (3, 6), + (4, 8), + (5, 10), + (6, 12), + (7, 14), + (8, 16), + (9, 18), + (10, 20), + (11, 22), + (12, 24), + (13, 26), + (14, 28), + (15, 30), + (16, 32), + # + (17, 33), + (18, 34), + (19, 35), + (20, 36), + (21, 37), + # + (22, 42), + (23, 43), + (24, 44), + (25, 45), + (26, 46), + # + (27, 51), + (28, 52), + (29, 53), + (30, 54), + # + (31, 58), + (32, 59), + (33, 60), + (34, 61), + (35, 62), + # + (36, 
66), + (39, 70), + # + ((37, 38), 68), + ((40, 41), 72), + # + (42, 75), + (45, 79), + # + ((43, 44), 77), + ((46, 47), 81), + # + (48, 84), + (49, 85), + (50, 86), + (51, 87), + (52, 88), + (53, 89), + (54, 90), + (55, 91), + (56, 92), + (57, 93), + (58, 94), + (59, 95), + (60, 96), + (61, 97), + (62, 98), + (63, 99), + (64, 100), + (65, 101), + (66, 102), + (67, 103) +] + +mapping_halpe = [ + # + (26, 0), + (27, 2), + (28, 4), + (29, 6), + (30, 8), + (31, 10), + (32, 12), + (33, 14), + (34, 16), + (35, 18), + (36, 20), + (37, 22), + (38, 24), + (39, 26), + (40, 28), + (41, 30), + (42, 32), + # + (43, 33), + (44, 34), + (45, 35), + (46, 36), + (47, 37), + # + (48, 42), + (49, 43), + (50, 44), + (51, 45), + (52, 46), + # + (53, 51), + (54, 52), + (55, 53), + (56, 54), + # + (57, 58), + (58, 59), + (59, 60), + (60, 61), + (61, 62), + # + (62, 66), + (65, 70), + # + ((63, 64), 68), + ((66, 67), 72), + # + (68, 75), + (71, 79), + # + ((69, 70), 77), + ((72, 73), 81), + # + (74, 84), + (75, 85), + (76, 86), + (77, 87), + (78, 88), + (79, 89), + (80, 90), + (81, 91), + (82, 92), + (83, 93), + (84, 94), + (85, 95), + (86, 96), + (87, 97), + (88, 98), + (89, 99), + (90, 100), + (91, 101), + (92, 102), + (93, 103) +] + +mapping_wflw = [ + # + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), + (17, 17), + (18, 18), + (19, 19), + (20, 20), + (21, 21), + (22, 22), + (23, 23), + (24, 24), + (25, 25), + (26, 26), + (27, 27), + (28, 28), + (29, 29), + (30, 30), + (31, 31), + (32, 32), + # + (33, 33), + (34, 34), + (35, 35), + (36, 36), + (37, 37), + (38, 38), + (39, 39), + (40, 40), + (41, 41), + # + (42, 42), + (43, 43), + (44, 44), + (45, 45), + (46, 46), + (47, 47), + (48, 48), + (49, 49), + (50, 50), + # + (51, 51), + (52, 52), + (53, 53), + (54, 54), + # + (55, 58), + (56, 59), + (57, 60), + (58, 61), + (59, 62), + # + (60, 66), + (61, 67), + 
(62, 68), + (63, 69), + (64, 70), + (65, 71), + (66, 72), + (67, 73), + # + (68, 75), + (69, 76), + (70, 77), + (71, 78), + (72, 79), + (73, 80), + (74, 81), + (75, 82), + # + (76, 84), + (77, 85), + (78, 86), + (79, 87), + (80, 88), + (81, 89), + (82, 90), + (83, 91), + (84, 92), + (85, 93), + (86, 94), + (87, 95), + (88, 96), + (89, 97), + (90, 98), + (91, 99), + (92, 100), + (93, 101), + (94, 102), + (95, 103), + # + (96, 104), + # + (97, 105) +] + +mapping_cofw = [ + # + (0, 33), + (2, 38), + (4, 35), + (5, 40), + # + (1, 46), + (3, 50), + (6, 44), + (7, 48), + # + (8, 60), + (10, 64), + (12, 62), + (13, 66), + # + (9, 72), + (11, 68), + (14, 70), + (15, 74), + # + (18, 57), + (19, 63), + (20, 54), + (21, 60), + # + (22, 84), + (23, 90), + (24, 87), + (25, 98), + (26, 102), + (27, 93), + # + (28, 16) +] +dataset_coco = dict( + type='CocoWholeBodyFaceDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw) + ], +) + +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, 
mapping=mapping_cofw) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_133kpt.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/lapa.py'), + datasets=[ + dataset_lapa, dataset_coco, dataset_wflw, dataset_300w, + dataset_cofw, dataset_halpe + ], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + pin_memory=True, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_test.json', + data_prefix=dict(img='pose/LaPa/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# test dataset +val_lapa = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_test.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[], +) + +val_coco = dict( + type='CocoWholeBodyFaceDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +val_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, 
mapping=mapping_wflw) + ], +) + +val_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_test.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +val_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_test.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe) + ], +) + +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/lapa.py'), + datasets=[val_lapa, val_coco, val_wflw, val_300w, val_cofw, val_halpe], + pipeline=val_pipeline, + test_mode=True, + )) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='NME', rule='less', max_keep_ckpts=1, interval=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-t_8xb256-120e_face6-256x256.py 
b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-t_8xb256-120e_face6-256x256.py new file mode 100644 index 0000000..88e9517 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-t_8xb256-120e_face6-256x256.py @@ -0,0 +1,689 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# lapa coco wflw 300w cofw halpe + +# runtime +max_epochs = 120 +stage2_num_epochs = 10 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.005, + begin=30, + end=max_epochs, + T_max=90, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e-3a2dd350.pth' # noqa + )), + 
head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=106, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'LapaDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.2), + dict(type='MedianBlur', p=0.2), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +# train dataset +dataset_lapa = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[], +) + +kpt_68_to_106 = [ + # + (0, 0), + (1, 2), + (2, 4), + (3, 6), + (4, 8), + (5, 10), + (6, 12), + (7, 14), + (8, 16), + (9, 18), + (10, 20), + (11, 22), + (12, 24), + (13, 26), + (14, 28), + (15, 30), + (16, 32), + # + (17, 33), + (18, 34), + (19, 35), + (20, 36), + (21, 37), + # + (22, 42), + (23, 43), + (24, 44), + (25, 45), + (26, 46), + # + (27, 51), + (28, 52), + (29, 53), + (30, 54), + # + (31, 58), + (32, 59), + (33, 60), + (34, 61), + (35, 62), + # + (36, 66), + (39, 70), + # + ((37, 38), 68), + ((40, 41), 72), + # + (42, 75), + (45, 79), + # + ((43, 44), 77), + ((46, 47), 81), + # + (48, 84), + (49, 85), + (50, 86), + (51, 87), + (52, 88), + (53, 89), + (54, 90), + (55, 91), + (56, 92), + (57, 93), + (58, 94), + (59, 95), + (60, 96), + (61, 97), + (62, 98), + (63, 99), + (64, 100), + (65, 101), + (66, 102), + (67, 103) +] + +mapping_halpe = [ + # + (26, 0), + (27, 2), + (28, 4), + (29, 6), + (30, 8), + (31, 10), + (32, 12), + (33, 14), + (34, 16), + (35, 18), + (36, 20), + (37, 22), + (38, 24), + (39, 26), + (40, 28), + (41, 30), + (42, 32), + # + (43, 33), + (44, 34), + (45, 35), + (46, 36), + (47, 37), + # + (48, 42), + (49, 43), + (50, 44), + (51, 45), + (52, 46), + # + (53, 51), + (54, 52), + (55, 53), + (56, 54), + # + (57, 58), + (58, 59), + (59, 60), + (60, 61), + (61, 
62), + # + (62, 66), + (65, 70), + # + ((63, 64), 68), + ((66, 67), 72), + # + (68, 75), + (71, 79), + # + ((69, 70), 77), + ((72, 73), 81), + # + (74, 84), + (75, 85), + (76, 86), + (77, 87), + (78, 88), + (79, 89), + (80, 90), + (81, 91), + (82, 92), + (83, 93), + (84, 94), + (85, 95), + (86, 96), + (87, 97), + (88, 98), + (89, 99), + (90, 100), + (91, 101), + (92, 102), + (93, 103) +] + +mapping_wflw = [ + # + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), + (17, 17), + (18, 18), + (19, 19), + (20, 20), + (21, 21), + (22, 22), + (23, 23), + (24, 24), + (25, 25), + (26, 26), + (27, 27), + (28, 28), + (29, 29), + (30, 30), + (31, 31), + (32, 32), + # + (33, 33), + (34, 34), + (35, 35), + (36, 36), + (37, 37), + (38, 38), + (39, 39), + (40, 40), + (41, 41), + # + (42, 42), + (43, 43), + (44, 44), + (45, 45), + (46, 46), + (47, 47), + (48, 48), + (49, 49), + (50, 50), + # + (51, 51), + (52, 52), + (53, 53), + (54, 54), + # + (55, 58), + (56, 59), + (57, 60), + (58, 61), + (59, 62), + # + (60, 66), + (61, 67), + (62, 68), + (63, 69), + (64, 70), + (65, 71), + (66, 72), + (67, 73), + # + (68, 75), + (69, 76), + (70, 77), + (71, 78), + (72, 79), + (73, 80), + (74, 81), + (75, 82), + # + (76, 84), + (77, 85), + (78, 86), + (79, 87), + (80, 88), + (81, 89), + (82, 90), + (83, 91), + (84, 92), + (85, 93), + (86, 94), + (87, 95), + (88, 96), + (89, 97), + (90, 98), + (91, 99), + (92, 100), + (93, 101), + (94, 102), + (95, 103), + # + (96, 104), + # + (97, 105) +] + +mapping_cofw = [ + # + (0, 33), + (2, 38), + (4, 35), + (5, 40), + # + (1, 46), + (3, 50), + (6, 44), + (7, 48), + # + (8, 60), + (10, 64), + (12, 62), + (13, 66), + # + (9, 72), + (11, 68), + (14, 70), + (15, 74), + # + (18, 57), + (19, 63), + (20, 54), + (21, 60), + # + (22, 84), + (23, 90), + (24, 87), + (25, 98), + (26, 102), + (27, 93), + # + (28, 16) +] +dataset_coco = 
dict( + type='CocoWholeBodyFaceDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw) + ], +) + +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_133kpt.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/lapa.py'), + datasets=[ + dataset_lapa, dataset_coco, dataset_wflw, dataset_300w, + dataset_cofw, dataset_halpe + ], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + 
persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_test.json', + data_prefix=dict(img='pose/LaPa/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# test dataset +val_lapa = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_test.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[], +) + +val_coco = dict( + type='CocoWholeBodyFaceDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +val_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw) + ], +) + +val_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_test.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106) + ], +) + +val_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_test.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw) + ], +) + +val_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[ + dict( + type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe) + ], 
+) + +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/lapa.py'), + datasets=[val_lapa, val_coco, val_wflw, val_300w, val_cofw, val_halpe], + pipeline=val_pipeline, + test_mode=True, + )) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='NME', rule='less', max_keep_ckpts=1, interval=1)) + +custom_hooks = [ + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.md new file mode 100644 index 0000000..f8f3764 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.md @@ -0,0 +1,71 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +- Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset. +- `Face6` and `*` denote model trained on 6 public datasets: + - [COCO-Wholebody-Face](https://github.com/jin-s13/COCO-WholeBody/) + - [WFLW](https://wywu.github.io/projects/LAB/WFLW.html) + - [300W](https://ibug.doc.ic.ac.uk/resources/300-W/) + - [COFW](http://www.vision.caltech.edu/xpburgos/ICCV13/) + - [Halpe](https://github.com/Fang-Haoshu/Halpe-FullBody/) + - [LaPa](https://github.com/JDAI-CV/lapa-dataset) + +| Config | Input Size | NME
(LaPa) | FLOPS
(G) | Download | +| :--------------------------------------------------------------------------: | :--------: | :----------------: | :---------------: | :-----------------------------------------------------------------------------: | +| [RTMPose-t\*](./rtmpose/face_2d_keypoint/rtmpose-t_8xb256-120e_face6-256x256.py) | 256x256 | 1.67 | 0.652 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-face6_pt-in1k_120e-256x256-df79d9a5_20230529.pth) | +| [RTMPose-s\*](./rtmpose/face_2d_keypoint/rtmpose-s_8xb256-120e_face6-256x256.py) | 256x256 | 1.59 | 1.119 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-face6_pt-in1k_120e-256x256-d779fdef_20230529.pth) | +| [RTMPose-m\*](./rtmpose/face_2d_keypoint/rtmpose-m_8xb256-120e_face6-256x256.py) | 256x256 | 1.44 | 2.852 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-face6_pt-in1k_120e-256x256-72a37400_20230529.pth) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.yml b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.yml new file mode 100644 index 0000000..a9eb2fc --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.yml @@ -0,0 +1,51 @@ +Collections: +- Name: RTMPose + Paper: + Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose" + URL: https://arxiv.org/abs/2303.07399 + README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md +Models: +- Config: configs/face_2d_keypoint/rtmpose/face6/rtmpose-t_8xb256-120e_face6-256x256.py + In Collection: RTMPose + Metadata: + Architecture: &id001 + - RTMPose + Training Data: &id002 + - COCO-Wholebody-Face + - WFLW + - 300W + - COFW + - Halpe + - LaPa + Name: rtmpose-t_8xb256-120e_face6-256x256 + Results: + - Dataset: Face6 + Metrics: + NME: 1.67 + Task: Face 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-face6_pt-in1k_120e-256x256-df79d9a5_20230529.pth +- Config: configs/face_2d_keypoint/rtmpose/face6/rtmpose-s_8xb256-120e_face6-256x256.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-s_8xb256-120e_face6-256x256 + Results: + - Dataset: Face6 + Metrics: + NME: 1.59 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-face6_pt-in1k_120e-256x256-d779fdef_20230529.pth +- Config: configs/face_2d_keypoint/rtmpose/face6/rtmpose-m_8xb256-120e_face6-256x256.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: *id002 + Name: rtmpose-m_8xb256-120e_face6-256x256 + Alias: face + Results: + - Dataset: Face6 + Metrics: + NME: 1.44 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-face6_pt-in1k_120e-256x256-72a37400_20230529.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py new file mode 100644 index 0000000..8e43b73 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py @@ -0,0 +1,246 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 120 +stage2_num_epochs = 10 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + 
begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=106, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'LapaDataset' +data_mode = 'topdown' +data_root = 'data/LaPa/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/LaPa/', +# f'{data_root}': 's3://openmmlab/datasets/pose/LaPa/' +# })) + +# pipelines +train_pipeline = [ + 
dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.2), + dict(type='MedianBlur', p=0.2), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/lapa_train.json', + data_prefix=dict(img=''), + 
pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/lapa_val.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/lapa_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='NME', rule='less', max_keep_ckpts=1, interval=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.md new file mode 100644 index 0000000..837d3fd --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.md @@ -0,0 +1,40 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+LaPa (AAAI'2020) + +```bibtex +@inproceedings{liu2020new, + title={A New Dataset and Boundary-Attention Semantic Segmentation for Face Parsing.}, + author={Liu, Yinglu and Shi, Hailin and Shen, Hao and Si, Yue and Wang, Xiaobo and Mei, Tao}, + booktitle={AAAI}, + pages={11637--11644}, + year={2020} +} +``` + +
+ +Results on LaPa val set + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :------------------------------------------------------------: | +| [pose_rtmpose_m](/configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py) | 256x256 | 1.29 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-lapa_pt-aic-coco_120e-256x256-762b1ae2_20230422.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-lapa_pt-aic-coco_120e-256x256-762b1ae2_20230422.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.yml b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.yml new file mode 100644 index 0000000..9f4cf04 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.yml @@ -0,0 +1,15 @@ +Models: +- Config: configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py + In Collection: RTMPose + Alias: face + Metadata: + Architecture: + - RTMPose + Training Data: LaPa + Name: rtmpose-m_8xb64-120e_lapa-256x256 + Results: + - Dataset: WFLW + Metrics: + NME: 1.29 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-lapa_pt-aic-coco_120e-256x256-762b1ae2_20230422.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py new file mode 100644 index 0000000..833235d --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py @@ -0,0 +1,231 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 60 +stage2_num_epochs = 10 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=1) 
+randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=98, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + 
decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'WFLWDataset' +data_mode = 'topdown' +data_root = 'data/wflw/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/WFLW/', +# f'{data_root}': 's3://openmmlab/datasets/pose/WFLW/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='NME', rule='less', max_keep_ckpts=1, interval=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.md new file mode 100644 index 0000000..30554f7 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.md @@ -0,0 +1,42 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+WFLW (CVPR'2018) + +```bibtex +@inproceedings{wu2018look, + title={Look at boundary: A boundary-aware face alignment algorithm}, + author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={2129--2138}, + year={2018} +} +``` + +
+ +Results on WFLW dataset + +The model is trained on WFLW train. + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :------------------------------------------------------------: | +| [pose_rtmpose_m](/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py) | 256x256 | 4.01 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-wflw_pt-aic-coco_60e-256x256-dc1dcdcf_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-wflw_pt-aic-coco_60e-256x256-dc1dcdcf_20230228.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.yml new file mode 100644 index 0000000..7ec6a7f --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.yml @@ -0,0 +1,14 @@ +Models: +- Config: configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py + In Collection: RTMPose + Metadata: + Architecture: + - RTMPose + Training Data: WFLW + Name: rtmpose-m_8xb64-60e_wflw-256x256 + Results: + - Dataset: WFLW + Metrics: + NME: 4.01 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-wflw_pt-aic-coco_60e-256x256-dc1dcdcf_20230228.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.md new file mode 100644 index 0000000..8da5476 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.md @@ -0,0 +1,44 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+300W (IMAVIS'2016) + +```bibtex +@article{sagonas2016300, + title={300 faces in-the-wild challenge: Database and results}, + author={Sagonas, Christos and Antonakos, Epameinondas and Tzimiropoulos, Georgios and Zafeiriou, Stefanos and Pantic, Maja}, + journal={Image and vision computing}, + volume={47}, + pages={3--18}, + year={2016}, + publisher={Elsevier} +} +``` + +
+ +Results on 300W dataset + +The model is trained on 300W train. + +| Arch | Input Size | NME*common* | NME*challenge* | NME*full* | NME*test* | ckpt | log | +| :--------------------------------- | :--------: | :--------------------: | :-----------------------: | :------------------: | :------------------: | :---------------------------------: | :--------------------------------: | +| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py) | 256x256 | 2.92 | 5.64 | 3.45 | 4.10 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_300w_256x256-eea53406_20211019.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_300w_256x256_20211019.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.yml new file mode 100644 index 0000000..4a813d1 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.yml @@ -0,0 +1,23 @@ +Collections: +- Name: HRNetv2 + Paper: + Title: Deep High-Resolution Representation Learning for Visual Recognition + URL: https://ieeexplore.ieee.org/abstract/document/9052469/ + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrnetv2.md +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + Training Data: 300W + Name: td-hm_hrnetv2-w18_8xb64-60e_300w-256x256 + Results: + - Dataset: 300W + Metrics: + NME challenge: 5.64 + NME common: 2.92 + NME full: 3.45 + NME test: 4.1 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_300w_256x256-eea53406_20211019.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py 
b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py new file mode 100644 index 0000000..7f279f0 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py @@ -0,0 +1,161 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=60, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=1.5) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'), + ), + neck=dict( + 
type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=68, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'Face300WDataset' +data_mode = 'topdown' +data_root = 'data/300w/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_300w_valid.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.md new file mode 100644 index 0000000..02c4bb7 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.md @@ -0,0 +1,42 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+300WLP (IEEE'2017) + +```bibtex +@article{zhu2017face, + title={Face alignment in full pose range: A 3d total solution}, + author={Zhu, Xiangyu and Liu, Xiaoming and Lei, Zhen and Li, Stan Z}, + journal={IEEE transactions on pattern analysis and machine intelligence}, + year={2017}, + publisher={IEEE} +} +``` + +
+ +Results on 300W-LP dataset + +The model is trained on 300W-LP train. + +| Arch | Input Size | NME*full* | NME*test* | ckpt | log | +| :------------------------------------------------- | :--------: | :------------------: | :------------------: | :------------------------------------------------: | :------------------------------------------------: | +| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py) | 256x256 | 0.0413 | 0.04125 | [ckpt](https://download.openmmlab.com/mmpose/v1/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_w18_300wlp_256x256-fb433d21_20230922.pth) | [log](https://download.openmmlab.com/mmpose/v1/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_w18_300wlp_256x256-fb433d21_20230922.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.yml new file mode 100644 index 0000000..8a63761 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.yml @@ -0,0 +1,20 @@ +Collections: +- Name: HRNetv2 + Paper: + Title: Deep High-Resolution Representation Learning for Visual Recognition + URL: https://ieeexplore.ieee.org/abstract/document/9052469/ + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrnetv2.md +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + Training Data: 300W-LP + Name: td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256 + Results: + - Dataset: 300W-LP + Metrics: + NME full: 0.0413 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_w18_300wlp_256x256-fb433d21_20230922.pth diff --git 
a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py new file mode 100644 index 0000000..b4b5f61 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py @@ -0,0 +1,160 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=60, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=1.5) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', 
align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=68, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'Face300WLPDataset' +data_mode = 'topdown' +data_root = 'data/300wlp/' +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_300wlp_train.json', + data_prefix=dict(img='train/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_300wlp_valid.json', + data_prefix=dict(img='val/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator 
= dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/README.md new file mode 100644 index 0000000..53fd36d --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/README.md @@ -0,0 +1,57 @@ +# Top-down heatmap-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. Instead of estimating keypoint coordinates directly, the pose estimator will produce heatmaps which represent the likelihood of being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html). + +
+ +
+ +## Results and Models + +### 300W Dataset + +Results on 300W dataset + +| Model | Input Size | NME*common* | NME*challenge* | NME*full* | NME*test* | Details and Download | +| :---------: | :--------: | :--------------------: | :-----------------------: | :------------------: | :------------------: | :---------------------------------------: | +| HRNetv2-w18 | 256x256 | 2.92 | 5.64 | 3.45 | 4.10 | [hrnetv2_300w.md](./300w/hrnetv2_300w.md) | + +### AFLW Dataset + +Results on AFLW dataset + +| Model | Input Size | NME*full* | NME*frontal* | Details and Download | +| :--------------: | :--------: | :------------------: | :---------------------: | :-------------------------------------------------: | +| HRNetv2-w18+Dark | 256x256 | 1.35 | 1.19 | [hrnetv2_dark_aflw.md](./aflw/hrnetv2_dark_aflw.md) | +| HRNetv2-w18 | 256x256 | 1.41 | 1.27 | [hrnetv2_aflw.md](./aflw/hrnetv2_aflw.md) | + +### COCO-WholeBody-Face Dataset + +Results on COCO-WholeBody-Face val set + +| Model | Input Size | NME | Details and Download | +| :--------------: | :--------: | :----: | :----------------------------------------------------------------------------------------------: | +| HRNetv2-w18+Dark | 256x256 | 0.0513 | [hrnetv2_dark_coco_wholebody_face.md](./coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.md) | +| SCNet-50 | 256x256 | 0.0567 | [scnet_coco_wholebody_face.md](./coco_wholebody_face/scnet_coco_wholebody_face.md) | +| HRNetv2-w18 | 256x256 | 0.0569 | [hrnetv2_coco_wholebody_face.md](./coco_wholebody_face/hrnetv2_coco_wholebody_face.md) | +| ResNet-50 | 256x256 | 0.0582 | [resnet_coco_wholebody_face.md](./coco_wholebody_face/resnet_coco_wholebody_face.md) | +| HourglassNet | 256x256 | 0.0587 | [hourglass_coco_wholebody_face.md](./coco_wholebody_face/hourglass_coco_wholebody_face.md) | +| MobileNet-v2 | 256x256 | 0.0611 | [mobilenetv2_coco_wholebody_face.md](./coco_wholebody_face/mobilenetv2_coco_wholebody_face.md) | + +### COFW Dataset + +Results on COFW dataset + +| Model | 
Input Size | NME | Details and Download | +| :---------: | :--------: | :--: | :---------------------------------------: | +| HRNetv2-w18 | 256x256 | 3.48 | [hrnetv2_cofw.md](./cofw/hrnetv2_cofw.md) | + +### WFLW Dataset + +Results on WFLW dataset + +| Model | Input Size | NME*test* | NME*pose* | NME*illumination* | NME*occlusion* | NME*blur* | NME*makeup* | NME*expression* | Details and Download | +| :-----: | :--------: | :------------------: | :------------------: | :--------------------------: | :-----------------------: | :------------------: | :--------------------: | :------------------------: | :--------------------: | +| HRNetv2-w18+Dark | 256x256 | 3.98 | 6.98 | 3.96 | 4.78 | 4.56 | 3.89 | 4.29 | [hrnetv2_dark_wflw.md](./wflw/hrnetv2_dark_wflw.md) | +| HRNetv2-w18+AWing | 256x256 | 4.02 | 6.94 | 3.97 | 4.78 | 4.59 | 3.87 | 4.28 | [hrnetv2_awing_wflw.md](./wflw/hrnetv2_awing_wflw.md) | +| HRNetv2-w18 | 256x256 | 4.06 | 6.97 | 3.99 | 4.83 | 4.58 | 3.94 | 4.33 | [hrnetv2_wflw.md](./wflw/hrnetv2_wflw.md) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.md new file mode 100644 index 0000000..36aade2 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.md @@ -0,0 +1,43 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+AFLW (ICCVW'2011) + +```bibtex +@inproceedings{koestinger2011annotated, + title={Annotated facial landmarks in the wild: A large-scale, real-world database for facial landmark localization}, + author={Koestinger, Martin and Wohlhart, Paul and Roth, Peter M and Bischof, Horst}, + booktitle={2011 IEEE international conference on computer vision workshops (ICCV workshops)}, + pages={2144--2151}, + year={2011}, + organization={IEEE} +} +``` + +
+ +Results on AFLW dataset + +The model is trained on AFLW train and evaluated on AFLW full and frontal. + +| Arch | Input Size | NME*full* | NME*frontal* | ckpt | log | +| :------------------------------------------------ | :--------: | :------------------: | :---------------------: | :-----------------------------------------------: | :-----------------------------------------------: | +| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py) | 256x256 | 1.41 | 1.27 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256-f2bbc62b_20210125.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256_20210125.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.yml new file mode 100644 index 0000000..ce0bdcc --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.yml @@ -0,0 +1,15 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + Training Data: AFLW + Name: td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256 + Results: + - Dataset: AFLW + Metrics: + NME frontal: 1.27 + NME full: 1.41 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256-f2bbc62b_20210125.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.md new file mode 100644 index 0000000..fc4f25e --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.md @@ -0,0 +1,60 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+AFLW (ICCVW'2011) + +```bibtex +@inproceedings{koestinger2011annotated, + title={Annotated facial landmarks in the wild: A large-scale, real-world database for facial landmark localization}, + author={Koestinger, Martin and Wohlhart, Paul and Roth, Peter M and Bischof, Horst}, + booktitle={2011 IEEE international conference on computer vision workshops (ICCV workshops)}, + pages={2144--2151}, + year={2011}, + organization={IEEE} +} +``` + +
+ +Results on AFLW dataset + +The model is trained on AFLW train and evaluated on AFLW full and frontal. + +| Arch | Input Size | NME*full* | NME*frontal* | ckpt | log | +| :------------------------------------------------ | :--------: | :------------------: | :---------------------: | :-----------------------------------------------: | :-----------------------------------------------: | +| [pose_hrnetv2_w18_dark](/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py) | 256x256 | 1.35 | 1.19 | [ckpt](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_aflw_256x256_dark-219606c0_20210125.pth) | [log](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_aflw_256x256_dark_20210125.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.yml new file mode 100644 index 0000000..955adb6 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py + In Collection: DarkPose + Metadata: + Architecture: + - HRNetv2 + - DarkPose + Training Data: AFLW + Name: td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256 + Results: + - Dataset: AFLW + Metrics: + NME frontal: 1.19 + NME full: 1.34 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_aflw_256x256_dark-219606c0_20210125.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py new file mode 100644 index 0000000..50d197b --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py @@ 
-0,0 +1,156 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=60, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=19, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + 
test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AFLWDataset' +data_mode = 'topdown' +data_root = 'data/aflw/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_aflw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_aflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', norm_mode='use_norm_item', norm_item='bbox_size') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py new file mode 100644 index 0000000..335cd34 --- /dev/null +++ 
b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py @@ -0,0 +1,160 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=60, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=19, + deconv_out_channels=None, + 
conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'AFLWDataset' +data_mode = 'topdown' +data_root = 'data/aflw/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_aflw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_aflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', norm_mode='use_norm_item', norm_item='bbox_size') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.md 
b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.md new file mode 100644 index 0000000..26f08da --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.md @@ -0,0 +1,39 @@ + + +
+Hourglass (ECCV'2016) + +```bibtex +@inproceedings{newell2016stacked, + title={Stacked hourglass networks for human pose estimation}, + author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia}, + booktitle={European conference on computer vision}, + pages={483--499}, + year={2016}, + organization={Springer} +} +``` + +
+ + + +
+COCO-WholeBody-Face (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Face val set + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [pose_hourglass_52](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0587 | [ckpt](https://download.openmmlab.com/mmpose/face/hourglass/hourglass52_coco_wholebody_face_256x256-6994cf2e_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/hourglass/hourglass52_coco_wholebody_face_256x256_20210909.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.yml new file mode 100644 index 0000000..185474f --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.yml @@ -0,0 +1,14 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py + In Collection: Hourglass + Metadata: + Architecture: + - Hourglass + Training Data: COCO-WholeBody-Face + Name: td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256 + Results: + - Dataset: COCO-WholeBody-Face + Metrics: + NME: 0.0587 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/hourglass/hourglass52_coco_wholebody_face_256x256-6994cf2e_20210909.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.md new file mode 100644 index 0000000..3cf9109 --- /dev/null +++ 
b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.md @@ -0,0 +1,39 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+COCO-WholeBody-Face (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Face val set + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0569 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_coco_wholebody_face_256x256-c1ca469b_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_coco_wholebody_face_256x256_20210909.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.yml new file mode 100644 index 0000000..e7e526d --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.yml @@ -0,0 +1,14 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + Training Data: COCO-WholeBody-Face + Name: td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256 + Results: + - Dataset: COCO-WholeBody-Face + Metrics: + NME: 0.0569 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_coco_wholebody_face_256x256-c1ca469b_20210909.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.md new file mode 100644 index 0000000..60914db --- /dev/null +++ 
b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.md @@ -0,0 +1,56 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+COCO-WholeBody-Face (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Face val set + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [pose_hrnetv2_w18_dark](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0513 | [ckpt](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_coco_wholebody_face_256x256_dark-3d9a334e_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_coco_wholebody_face_256x256_dark_20210909.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.yml new file mode 100644 index 0000000..ca0cefd --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.yml @@ -0,0 +1,15 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py + In Collection: DarkPose + Metadata: + Architecture: + - HRNetv2 + - DarkPose + Training Data: COCO-WholeBody-Face + Name: td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256 + Results: + - Dataset: COCO-WholeBody-Face + Metrics: + NME: 0.0513 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_coco_wholebody_face_256x256_dark-3d9a334e_20210909.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.md new file mode 100644 index 
0000000..a520407 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.md @@ -0,0 +1,38 @@ + + +
+MobilenetV2 (CVPR'2018) + +```bibtex +@inproceedings{sandler2018mobilenetv2, + title={Mobilenetv2: Inverted residuals and linear bottlenecks}, + author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4510--4520}, + year={2018} +} +``` + +
+ + + +
+COCO-WholeBody-Face (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Face val set + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [pose_mobilenetv2](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0611 | [ckpt](https://download.openmmlab.com/mmpose/face/mobilenetv2/mobilenetv2_coco_wholebody_face_256x256-4a3f096e_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/mobilenetv2/mobilenetv2_coco_wholebody_face_256x256_20210909.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.yml new file mode 100644 index 0000000..6d80729 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.yml @@ -0,0 +1,15 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - MobilenetV2 + Training Data: COCO-WholeBody-Face + Name: td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256 + Results: + - Dataset: COCO-WholeBody-Face + Metrics: + NME: 0.0611 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/mobilenetv2/mobilenetv2_coco_wholebody_face_256x256-4a3f096e_20210909.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.md new file mode 100644 index 0000000..296588c --- 
/dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.md @@ -0,0 +1,55 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO-WholeBody-Face (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Face val set + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [pose_res50](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0582 | [ckpt](https://download.openmmlab.com/mmpose/face/resnet/res50_coco_wholebody_face_256x256-5128edf5_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/resnet/res50_coco_wholebody_face_256x256_20210909.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.yml new file mode 100644 index 0000000..c63e04b --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.yml @@ -0,0 +1,15 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - ResNet + Training Data: COCO-WholeBody-Face + Name: td-hm_res50_8xb32-60e_coco-wholebody-face-256x256 + Results: + - Dataset: COCO-WholeBody-Face + Metrics: + NME: 0.0582 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/resnet/res50_coco_wholebody_face_256x256-5128edf5_20210909.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.md new file mode 100644 index 0000000..368b16b --- /dev/null +++ 
b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.md @@ -0,0 +1,38 @@ + + +
+SCNet (CVPR'2020) + +```bibtex +@inproceedings{liu2020improving, + title={Improving Convolutional Networks with Self-Calibrated Convolutions}, + author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={10096--10105}, + year={2020} +} +``` + +
+ + + +
+COCO-WholeBody-Face (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Face val set + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [pose_scnet_50](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0567 | [ckpt](https://download.openmmlab.com/mmpose/face/scnet/scnet50_coco_wholebody_face_256x256-a0183f5f_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/scnet/scnet50_coco_wholebody_face_256x256_20210909.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.yml new file mode 100644 index 0000000..d0fde1e --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.yml @@ -0,0 +1,15 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - SCNet + Training Data: COCO-WholeBody-Face + Name: td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256 + Results: + - Dataset: COCO-WholeBody-Face + Metrics: + NME: 0.0567 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/scnet/scnet50_coco_wholebody_face_256x256-a0183f5f_20210909.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py new file mode 100644 index 0000000..135a45f --- /dev/null +++ 
b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py @@ -0,0 +1,123 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HourglassNet', + num_stacks=1, + ), + head=dict( + type='CPMHead', + in_channels=256, + out_channels=68, + num_stages=1, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyFaceDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py new file mode 100644 index 0000000..b751fae --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py @@ -0,0 +1,156 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# 
automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=68, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyFaceDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py new file mode 100644 index 0000000..a31e599 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py @@ -0,0 +1,160 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, 
end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=68, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyFaceDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py new file mode 100644 index 0000000..c4a314d --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = 
dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MobileNetV2', + widen_factor=1., + out_indices=(7, ), + init_cfg=dict(type='Pretrained', checkpoint='mmcls://mobilenet_v2')), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=68, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyFaceDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + 
batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py new file mode 100644 index 0000000..7b4dcad --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', 
interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=68, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyFaceDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py new file mode 100644 index 0000000..62b7885 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SCNet', + depth=50, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/scnet50-7ef0a199.pth')), + head=dict( + type='HeatmapHead', + in_channels=2048, + 
out_channels=68, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyFaceDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.md new file mode 100644 index 0000000..4828f2c --- /dev/null 
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.md @@ -0,0 +1,42 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+COFW (ICCV'2013) + +```bibtex +@inproceedings{burgos2013robust, + title={Robust face landmark estimation under occlusion}, + author={Burgos-Artizzu, Xavier P and Perona, Pietro and Doll{\'a}r, Piotr}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + pages={1513--1520}, + year={2013} +} +``` + +
+ +Results on COFW dataset + +The model is trained on COFW train. + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :------------------------------------------------------------: | +| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py) | 256x256 | 3.48 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_cofw_256x256-49243ab8_20211019.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_cofw_256x256_20211019.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.yml new file mode 100644 index 0000000..749e348 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.yml @@ -0,0 +1,14 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + Training Data: COFW + Name: td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256 + Results: + - Dataset: COFW + Metrics: + NME: 3.48 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_cofw_256x256-49243ab8_20211019.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py new file mode 100644 index 0000000..ee59f3d --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py @@ -0,0 +1,161 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = 
dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=50, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=1.5) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=29, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'COFWDataset' +data_mode = 
'topdown' +data_root = 'data/cofw/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/cofw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/cofw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.md new file mode 100644 index 0000000..4df239a --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.md @@ -0,0 +1,59 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+AdaptiveWingloss (ICCV'2019) + +```bibtex +@inproceedings{wang2019adaptive, + title={Adaptive wing loss for robust face alignment via heatmap regression}, + author={Wang, Xinyao and Bo, Liefeng and Fuxin, Li}, + booktitle={Proceedings of the IEEE/CVF international conference on computer vision}, + pages={6971--6981}, + year={2019} +} +``` + +
+ + + +
+WFLW (CVPR'2018) + +```bibtex +@inproceedings{wu2018look, + title={Look at boundary: A boundary-aware face alignment algorithm}, + author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={2129--2138}, + year={2018} +} +``` + +
+ +Results on WFLW dataset + +The model is trained on WFLW train. + +| Arch | Input Size | NME*test* | NME*pose* | NME*illumination* | NME*occlusion* | NME*blur* | NME*makeup* | NME*expression* | ckpt | log | +| :--------- | :--------: | :------------------: | :------------------: | :--------------------------: | :-----------------------: | :------------------: | :--------------------: | :------------------------: | :--------: | :-------: | +| [pose_hrnetv2_w18_awing](/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py) | 256x256 | 4.02 | 6.94 | 3.97 | 4.78 | 4.59 | 3.87 | 4.28 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256_awing-5af5055c_20211212.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256_awing_20211212.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.yml new file mode 100644 index 0000000..6a6d46a --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.yml @@ -0,0 +1,21 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + - AdaptiveWingloss + Training Data: WFLW + Name: td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256 + Results: + - Dataset: WFLW + Metrics: + NME blur: 4.59 + NME expression: 4.28 + NME illumination: 3.97 + NME makeup: 3.87 + NME occlusion: 4.78 + NME pose: 6.94 + NME test: 4.02 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256_awing-5af5055c_20211212.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.md new file mode 
100644 index 0000000..b36477b --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.md @@ -0,0 +1,59 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+WFLW (CVPR'2018) + +```bibtex +@inproceedings{wu2018look, + title={Look at boundary: A boundary-aware face alignment algorithm}, + author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={2129--2138}, + year={2018} +} +``` + +
+ +Results on WFLW dataset + +The model is trained on WFLW train. + +| Arch | Input Size | NME*test* | NME*pose* | NME*illumination* | NME*occlusion* | NME*blur* | NME*makeup* | NME*expression* | ckpt | log | +| :--------- | :--------: | :------------------: | :------------------: | :--------------------------: | :-----------------------: | :------------------: | :--------------------: | :------------------------: | :--------: | :-------: | +| [pose_hrnetv2_w18_dark](/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py) | 256x256 | 3.98 | 6.98 | 3.96 | 4.78 | 4.56 | 3.89 | 4.29 | [ckpt](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_wflw_256x256_dark-3f8e0c2c_20210125.pth) | [log](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_wflw_256x256_dark_20210125.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.yml new file mode 100644 index 0000000..303be33 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.yml @@ -0,0 +1,21 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py + In Collection: DarkPose + Metadata: + Architecture: + - HRNetv2 + - DarkPose + Training Data: WFLW + Name: td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256 + Results: + - Dataset: WFLW + Metrics: + NME blur: 4.56 + NME expression: 4.29 + NME illumination: 3.96 + NME makeup: 3.89 + NME occlusion: 4.78 + NME pose: 6.98 + NME test: 3.98 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_wflw_256x256_dark-3f8e0c2c_20210125.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.md new file mode 100644 index 
0000000..121f993 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.md @@ -0,0 +1,42 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+WFLW (CVPR'2018) + +```bibtex +@inproceedings{wu2018look, + title={Look at boundary: A boundary-aware face alignment algorithm}, + author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={2129--2138}, + year={2018} +} +``` + +
+ +Results on WFLW dataset + +The model is trained on WFLW train. + +| Arch | Input Size | NME*test* | NME*pose* | NME*illumination* | NME*occlusion* | NME*blur* | NME*makeup* | NME*expression* | ckpt | log | +| :--------- | :--------: | :------------------: | :------------------: | :--------------------------: | :-----------------------: | :------------------: | :--------------------: | :------------------------: | :--------: | :-------: | +| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py) | 256x256 | 4.06 | 6.97 | 3.99 | 4.83 | 4.58 | 3.94 | 4.33 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256-2bf032a6_20210125.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256_20210125.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml new file mode 100644 index 0000000..2d188c3 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml @@ -0,0 +1,20 @@ +Models: +- Config: configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + Training Data: WFLW + Name: td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256 + Results: + - Dataset: WFLW + Metrics: + NME blur: 4.58 + NME expression: 4.33 + NME illumination: 3.99 + NME makeup: 3.94 + NME occlusion: 4.83 + NME pose: 6.97 + NME test: 4.06 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256-2bf032a6_20210125.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py new file mode 100644 index 0000000..507035c --- 
/dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py @@ -0,0 +1,158 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=60, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=98, + deconv_out_channels=None, + 
conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'WFLWDataset' +data_mode = 'topdown' +data_root = 'data/wflw/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py 
b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py new file mode 100644 index 0000000..f6885da --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py @@ -0,0 +1,158 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=60, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'), + ), + neck=dict( + 
type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=98, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='AdaptiveWingLoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'WFLWDataset' +data_mode = 'topdown' +data_root = 'data/wflw/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py new file mode 100644 index 0000000..e1a47c7 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py @@ -0,0 +1,162 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=60, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=2e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=60, + milestones=[40, 55], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + 
upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'), + ), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=98, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'WFLWDataset' +data_mode = 'topdown' +data_root = 'data/wflw/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_prob=0, + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# 
evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/README.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/README.md new file mode 100644 index 0000000..c4b1cb4 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/README.md @@ -0,0 +1,19 @@ +# Top-down regression-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. At the 2nd stage, regression based methods directly regress the keypoint coordinates given the features extracted from the bounding box area, following the paradigm introduced in [Deeppose: Human pose estimation via deep neural networks](http://openaccess.thecvf.com/content_cvpr_2014/html/Toshev_DeepPose_Human_Pose_2014_CVPR_paper.html). + +
+ +
+ +## Results and Models + +### WFLW Dataset + +Result on WFLW test set + +| Model | Input Size | NME | ckpt | log | +| :-------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [ResNet-50](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py) | 256x256 | 4.88 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256-92d0ba7f_20210303.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_20210303.log.json) | +| [ResNet-50+WingLoss](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py) | 256x256 | 4.67 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss-f82a5e53_20210303.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss_20210303.log.json) | +| [ResNet-50+SoftWingLoss](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py) | 256x256 | 4.44 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss-4d34f22a_20211212.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss_20211212.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.md new file mode 100644 index 0000000..f36b939 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.md @@ -0,0 +1,75 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+SoftWingloss (TIP'2021) + +```bibtex +@article{lin2021structure, + title={Structure-Coherent Deep Feature Learning for Robust Face Alignment}, + author={Lin, Chunze and Zhu, Beier and Wang, Quan and Liao, Renjie and Qian, Chen and Lu, Jiwen and Zhou, Jie}, + journal={IEEE Transactions on Image Processing}, + year={2021}, + publisher={IEEE} +} +``` + +
+ + + +
+WFLW (CVPR'2018) + +```bibtex +@inproceedings{wu2018look, + title={Look at boundary: A boundary-aware face alignment algorithm}, + author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={2129--2138}, + year={2018} +} +``` + +
+ +Results on WFLW dataset + +The model is trained on WFLW train set. + +| Model | Input Size | NME | ckpt | log | +| :-------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [ResNet-50+SoftWingLoss](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py) | 256x256 | 4.44 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss-4d34f22a_20211212.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss_20211212.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.yml new file mode 100644 index 0000000..9e33fe2 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.yml @@ -0,0 +1,22 @@ +Collections: +- Name: SoftWingloss + Paper: + Title: Structure-Coherent Deep Feature Learning for Robust Face Alignment + URL: https://ieeexplore.ieee.org/document/9442331/ + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/softwingloss.md +Models: +- Config: configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py + In Collection: SoftWingloss + Metadata: + Architecture: + - DeepPose + - ResNet + - SoftWingloss + Training Data: WFLW + Name: td-reg_res50_softwingloss_8xb64-210e_wflw-256x256 + Results: + - Dataset: WFLW + Metrics: + NME: 4.44 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss-4d34f22a_20211212.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.md 
b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.md new file mode 100644 index 0000000..f605688 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.md @@ -0,0 +1,58 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+WFLW (CVPR'2018) + +```bibtex +@inproceedings{wu2018look, + title={Look at boundary: A boundary-aware face alignment algorithm}, + author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={2129--2138}, + year={2018} +} +``` + +
+ +Results on WFLW dataset + +The model is trained on WFLW train set. + +| Model | Input Size | NME | ckpt | log | +| :-------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [ResNet-50](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py) | 256x256 | 4.88 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256-92d0ba7f_20210303.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_20210303.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.yml new file mode 100644 index 0000000..c86c208 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.yml @@ -0,0 +1,21 @@ +Collections: +- Name: ResNet + Paper: + Title: Deep residual learning for image recognition + URL: http://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/resnet.md +Models: +- Config: configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py + In Collection: ResNet + Metadata: + Architecture: + - DeepPose + - ResNet + Training Data: WFLW + Name: td-reg_res50_8x64e-210e_wflw-256x256 + Results: + - Dataset: WFLW + Metrics: + NME: 4.88 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256-92d0ba7f_20210303.pth diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.md new file mode 100644 index 0000000..5dc9adc --- 
/dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.md @@ -0,0 +1,76 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+Wingloss (CVPR'2018) + +```bibtex +@inproceedings{feng2018wing, + title={Wing Loss for Robust Facial Landmark Localisation with Convolutional Neural Networks}, + author={Feng, Zhen-Hua and Kittler, Josef and Awais, Muhammad and Huber, Patrik and Wu, Xiao-Jun}, + booktitle={Computer Vision and Pattern Recognition (CVPR), 2018 IEEE Conference on}, + year={2018}, + pages ={2235-2245}, + organization={IEEE} +} +``` + +
+ + + +
+WFLW (CVPR'2018) + +```bibtex +@inproceedings{wu2018look, + title={Look at boundary: A boundary-aware face alignment algorithm}, + author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={2129--2138}, + year={2018} +} +``` + +
+ +Results on WFLW dataset + +The model is trained on WFLW train set. + +| Model | Input Size | NME | ckpt | log | +| :-------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [ResNet-50+WingLoss](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py) | 256x256 | 4.67 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss-f82a5e53_20210303.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss_20210303.log.json) | diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.yml new file mode 100644 index 0000000..327f07e --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.yml @@ -0,0 +1,23 @@ +Collections: +- Name: Wingloss + Paper: + Title: Wing Loss for Robust Facial Landmark Localisation with Convolutional Neural + Networks + URL: http://openaccess.thecvf.com/content_cvpr_2018/html/Feng_Wing_Loss_for_CVPR_2018_paper.html + README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/wingloss.md +Models: +- Config: configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py + In Collection: Wingloss + Metadata: + Architecture: + - DeepPose + - ResNet + - WingLoss + Training Data: WFLW + Name: td-reg_res50_wingloss_8xb64-210e_wflw-256x256 + Results: + - Dataset: WFLW + Metrics: + NME: 4.67 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss-f82a5e53_20210303.pth diff --git 
a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py new file mode 100644 index 0000000..dd9ade7 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(256, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=98, + loss=dict(type='SmoothL1Loss', use_target_weight=True), + decoder=codec), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'WFLWDataset' +data_mode = 'topdown' +data_root = 'data/wflw/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + scale_factor=[0.75, 1.25], + rotate_factor=60), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# dataloaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less')) + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py new file mode 100644 index 0000000..beae1bf --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + 
dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(256, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=98, + loss=dict(type='SoftWingLoss', use_target_weight=True), + decoder=codec), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'WFLWDataset' +data_mode = 'topdown' +data_root = 'data/wflw/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# dataloaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='images/'), + 
pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less')) + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py new file mode 100644 index 0000000..2f625e6 --- /dev/null +++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(256, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + 
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=98, + loss=dict(type='WingLoss', use_target_weight=True), + decoder=codec), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + shift_coords=True, + )) + +# base dataset settings +dataset_type = 'WFLWDataset' +data_mode = 'topdown' +data_root = 'data/wflw/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# dataloaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less')) + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator 
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/README.md b/modules/rtmpose/configs/fashion_2d_keypoint/README.md new file mode 100644 index 0000000..f4ec40f --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/README.md @@ -0,0 +1,7 @@ +# 2D Fashion Landmark Detection + +2D fashion landmark detection (also referred to as fashion alignment) aims to detect the key-point located at the functional region of clothes, for example the neckline and the cuff. + +## Data preparation + +Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_fashion_landmark.md) to prepare data. diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/README.md new file mode 100644 index 0000000..94e47b7 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/README.md @@ -0,0 +1,42 @@ +# Top-down heatmap-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. Instead of estimating keypoint coordinates directly, the pose estimator will produce heatmaps which represent the likelihood of being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html). + +
+ +
+ +## Results and Models + +### DeepFashion Dataset + +Results on DeepFashion dataset with ResNet backbones: + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :-----------------: | :--------: | :-----: | :--: | :--: | :----------------------------------------------------------: | +| HRNet-w48-UDP-Upper | 256x192 | 96.1 | 60.9 | 15.1 | [hrnet_deepfashion.md](./deepfashion/hrnet_deepfashion.md) | +| HRNet-w48-UDP-Lower | 256x192 | 97.8 | 76.1 | 8.9 | [hrnet_deepfashion.md](./deepfashion/hrnet_deepfashion.md) | +| HRNet-w48-UDP-Full | 256x192 | 98.3 | 67.3 | 11.7 | [hrnet_deepfashion.md](./deepfashion/hrnet_deepfashion.md) | +| ResNet-50-Upper | 256x192 | 95.4 | 57.8 | 16.8 | [resnet_deepfashion.md](./deepfashion/resnet_deepfashion.md) | +| ResNet-50-Lower | 256x192 | 96.5 | 74.4 | 10.5 | [resnet_deepfashion.md](./deepfashion/resnet_deepfashion.md) | +| ResNet-50-Full | 256x192 | 97.7 | 66.4 | 12.7 | [resnet_deepfashion.md](./deepfashion/resnet_deepfashion.md) | + +### DeepFashion2 Dataset + +Results on DeepFashion2 dataset + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :-----------------------------: | :--------: | :-----: | :---: | :--: | :-----------------------------------------------------------: | +| ResNet-50-Short-Sleeved-Shirt | 256x192 | 0.988 | 0.703 | 10.2 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Long-Sleeved-Shirt | 256x192 | 0.973 | 0.587 | 16.6 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Short-Sleeved-Outwear | 256x192 | 0.966 | 0.408 | 24.0 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Long-Sleeved-Outwear | 256x192 | 0.987 | 0.517 | 18.1 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Vest | 256x192 | 0.981 | 0.643 | 12.7 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Sling | 256x192 | 0.940 | 0.557 | 21.6 | 
[res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Shorts | 256x192 | 0.975 | 0.682 | 12.4 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Trousers | 256x192 | 0.973 | 0.625 | 14.8 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Skirt | 256x192 | 0.952 | 0.653 | 16.6 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Short-Sleeved-Dress | 256x192 | 0.980 | 0.603 | 15.6 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Long-Sleeved-Dress | 256x192 | 0.976 | 0.518 | 20.1 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Vest-Dress | 256x192 | 0.980 | 0.600 | 16.0 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | +| ResNet-50-Sling-Dress | 256x192 | 0.967 | 0.544 | 19.5 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) | diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.md b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.md new file mode 100644 index 0000000..ece7c38 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.md @@ -0,0 +1,77 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+DeepFashion (CVPR'2016) + +```bibtex +@inproceedings{liuLQWTcvpr16DeepFashion, + author = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou}, + title = {DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations}, + booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2016} +} +``` + +
+ + + +
+DeepFashion (ECCV'2016) + +```bibtex +@inproceedings{liuYLWTeccv16FashionLandmark, + author = {Liu, Ziwei and Yan, Sijie and Luo, Ping and Wang, Xiaogang and Tang, Xiaoou}, + title = {Fashion Landmark Detection in the Wild}, + booktitle = {European Conference on Computer Vision (ECCV)}, + month = {October}, + year = {2016} + } +``` + +
 + +Results on DeepFashion val set + +| Set | Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :---- | :-------------------------------------------------------: | :--------: | :-----: | :--: | :--: | :-------------------------------------------------------: | :------------------------------------------------------: | +| upper | [pose_hrnet_w48_udp](td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py) | 256x192 | 96.1 | 60.9 | 15.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192-de7c0eb1_20230810.pth) | [log](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192-de7c0eb1_20230810.log) | +| lower | [pose_hrnet_w48_udp](td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py) | 256x192 | 97.8 | 76.1 | 8.9 | [ckpt](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192-ddaf747d_20230810.pth) | [log](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192-ddaf747d_20230810.log) | +| full | [pose_hrnet_w48_udp](td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py) | 256x192 | 98.3 | 67.3 | 11.7 | [ckpt](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192-7ab504c7_20230810.pth) | [log](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192-7ab504c7_20230810.log) | + +Note: Due to the time constraints, we have only trained resnet50 models. We warmly welcome any contributions if you can successfully reproduce the results from the paper!
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.yml b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.yml new file mode 100644 index 0000000..ccca1d3 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.yml @@ -0,0 +1,45 @@ +Models: +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py + In Collection: UDP + Metadata: + Architecture: &id001 + - HRNet + - UDP + Training Data: DeepFashion + Name: td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192 + Results: + - Dataset: DeepFashion + Metrics: + AUC: 76.1 + EPE: 8.9 + PCK@0.2: 97.8 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192-ddaf747d_20230810.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: DeepFashion + Name: td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192 + Results: + - Dataset: DeepFashion + Metrics: + AUC: 60.9 + EPE: 15.1 + PCK@0.2: 96.1 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192-de7c0eb1_20230810.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py + In Collection: UDP + Metadata: + Architecture: *id001 + Training Data: DeepFashion + Name: td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192 + Results: + - Dataset: DeepFashion + Metrics: + AUC: 67.3 + EPE: 11.7 + PCK@0.2: 98.3 + Task: Fashion 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192-7ab504c7_20230810.pth diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.md b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.md new file mode 100644 index 0000000..4475ebc --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.md @@ -0,0 +1,77 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+DeepFashion (CVPR'2016) + +```bibtex +@inproceedings{liuLQWTcvpr16DeepFashion, + author = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou}, + title = {DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations}, + booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2016} +} +``` + +
+ + + +
+DeepFashion (ECCV'2016) + +```bibtex +@inproceedings{liuYLWTeccv16FashionLandmark, + author = {Liu, Ziwei and Yan, Sijie and Luo, Ping and Wang, Xiaogang and Tang, Xiaoou}, + title = {Fashion Landmark Detection in the Wild}, + booktitle = {European Conference on Computer Vision (ECCV)}, + month = {October}, + year = {2016} + } +``` + +
+ +Results on DeepFashion val set + +| Set | Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :---- | :-------------------------------------------------------: | :--------: | :-----: | :--: | :--: | :-------------------------------------------------------: | :------------------------------------------------------: | +| upper | [pose_resnet_50](td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py) | 256x192 | 95.4 | 57.8 | 16.8 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_upper_256x192-41794f03_20210124.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_upper_256x192_20210124.log.json) | +| lower | [pose_resnet_50](td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py) | 256x192 | 96.5 | 74.4 | 10.5 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_lower_256x192-1292a839_20210124.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_lower_256x192_20210124.log.json) | +| full | [pose_resnet_50](td-hm_res50_8xb64-210e_deepfashion_full-256x192.py) | 256x192 | 97.7 | 66.4 | 12.7 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_full_256x192-0dbd6e42_20210124.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_full_256x192_20210124.log.json) | + +Note: Due to the time constraints, we have only trained resnet50 models. We warmly welcome any contributions if you can successfully reproduce the results from the paper! 
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.yml b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.yml new file mode 100644 index 0000000..228c49c --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.yml @@ -0,0 +1,51 @@ +Collections: +- Name: SimpleBaseline2D + Paper: + Title: Simple baselines for human pose estimation and tracking + URL: http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html + README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/algorithms/simplebaseline2d.md +Models: +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: DeepFashion + Name: td-hm_res50_8xb64-210e_deepfashion_upper-256x192 + Results: + - Dataset: DeepFashion + Metrics: + AUC: 57.8 + EPE: 16.8 + PCK@0.2: 95.4 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_upper_256x192-41794f03_20210124.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion + Name: td-hm_res50_8xb64-210e_deepfashion_lower-256x192 + Results: + - Dataset: DeepFashion + Metrics: + AUC: 74.4 + EPE: 10.5 + PCK@0.2: 96.5 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_lower_256x192-1292a839_20210124.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_full-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion + Name: 
td-hm_res50_8xb64-210e_deepfashion_full-256x192 + Results: + - Dataset: DeepFashion + Metrics: + AUC: 66.4 + EPE: 12.7 + PCK@0.2: 97.7 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_full_256x192-0dbd6e42_20210124.pth diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py new file mode 100644 index 0000000..8044431 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py @@ -0,0 +1,169 @@ +_base_ = '../../../_base_/default_runtime.py' + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + 
block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=8, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashionDataset' +data_mode = 'topdown' +data_root = 'data/fld/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +test_pipeline = val_pipeline + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + subset='full', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_full_train.json', + data_prefix=dict(img='img/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='full', + data_root=data_root, + 
data_mode=data_mode, + ann_file='annotations/fld_full_val.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='full', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_full_test.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=test_pipeline, + )) + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py new file mode 100644 index 0000000..a89cbb5 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py @@ -0,0 +1,169 @@ +_base_ = '../../../_base_/default_runtime.py' + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model 
= dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=4, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashionDataset' +data_mode = 'topdown' +data_root = 'data/fld/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +test_pipeline = val_pipeline + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + subset='lower', + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_lower_train.json', + data_prefix=dict(img='img/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='lower', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_lower_val.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='lower', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_lower_test.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=test_pipeline, + )) + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py new file mode 100644 index 0000000..b6b8cec --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py @@ -0,0 +1,169 @@ +_base_ = '../../../_base_/default_runtime.py' + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + 
gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=6, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashionDataset' +data_mode = 'topdown' +data_root = 'data/fld/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline 
= [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +test_pipeline = val_pipeline + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + subset='upper', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_upper_train.json', + data_prefix=dict(img='img/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='upper', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_upper_val.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='upper', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_upper_test.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=test_pipeline, + )) + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_full-256x192.py new file mode 100644 index 0000000..0edccd2 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_full-256x192.py @@ -0,0 +1,26 @@ 
+_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py' + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +model = dict( + test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False)) + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_lower-256x192.py new file mode 100644 index 0000000..e400284 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_lower-256x192.py @@ -0,0 +1,26 @@ +_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py' + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +model = dict( + test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False)) + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_upper-256x192.py new file mode 100644 index 0000000..2003311 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_upper-256x192.py @@ -0,0 +1,26 @@ +_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py' + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +model = dict( + test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False)) + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_full-256x192.py new file mode 100644 index 0000000..f28a128 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_full-256x192.py @@ -0,0 +1,42 @@ +_base_ = 
'./td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py' + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +model = dict( + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict(in_channels=48)) + +train_dataloader = dict(batch_size=32) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_lower-256x192.py new file mode 100644 index 0000000..e823833 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_lower-256x192.py @@ -0,0 +1,42 @@ +_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py' # noqa + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +model = dict( + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + 
num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict(in_channels=48)) + +train_dataloader = dict(batch_size=32) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_upper-256x192.py new file mode 100644 index 0000000..a819a55 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_upper-256x192.py @@ -0,0 +1,42 @@ +_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py' # noqa + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +model = dict( + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict(in_channels=48)) + +train_dataloader = dict(batch_size=32) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py 
b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py new file mode 100644 index 0000000..ad0e2e0 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py @@ -0,0 +1,31 @@ +_base_ = './td-hm_hrnet-w48_8xb32-210e_deepfashion_full-256x192.py' + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +model = dict( + test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False)) + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_dataloader = dict(batch_size=32) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py new file mode 100644 index 0000000..91c6ec2 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py @@ -0,0 +1,31 @@ +_base_ = './td-hm_hrnet-w48_8xb32-210e_deepfashion_lower-256x192.py' + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = 
dict(base_batch_size=256) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +model = dict( + test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False)) + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_dataloader = dict(batch_size=32) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py new file mode 100644 index 0000000..ba707f6 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py @@ -0,0 +1,31 @@ +_base_ = './td-hm_hrnet-w48_8xb32-210e_deepfashion_upper-256x192.py' + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +model = dict( + test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False)) + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_dataloader = dict(batch_size=32) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_full-256x192.py new file mode 100644 index 0000000..15e5b54 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_full-256x192.py @@ -0,0 +1,8 @@ +_base_ = './td-hm_res50_8xb64-210e_deepfashion_full-256x192.py' + +model = dict( + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_lower-256x192.py new file mode 100644 index 0000000..78a3879 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_lower-256x192.py @@ -0,0 +1,8 @@ +_base_ = './td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py' + +model = dict( + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_upper-256x192.py new file mode 100644 index 0000000..449ef14 --- /dev/null +++ 
b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_upper-256x192.py @@ -0,0 +1,8 @@ +_base_ = './td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py' + +model = dict( + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_full-256x192.py new file mode 100644 index 0000000..a9a79d8 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_full-256x192.py @@ -0,0 +1,13 @@ +_base_ = './td-hm_res50_8xb64-210e_deepfashion_full-256x192.py' + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +model = dict( + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet152'))) + +train_dataloader = dict(batch_size=32) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_lower-256x192.py new file mode 100644 index 0000000..9d1cca6 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_lower-256x192.py @@ -0,0 +1,13 @@ +_base_ = './td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py' + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +model = dict( + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet152'))) + +train_dataloader = 
dict(batch_size=32) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_upper-256x192.py new file mode 100644 index 0000000..68f85cf --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_upper-256x192.py @@ -0,0 +1,13 @@ +_base_ = './td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py' + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +model = dict( + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet152'))) + +train_dataloader = dict(batch_size=32) diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_full-256x192.py new file mode 100644 index 0000000..e883794 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_full-256x192.py @@ -0,0 +1,140 @@ +_base_ = '../../../_base_/default_runtime.py' + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', 
rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=8, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashionDataset' +data_mode = 'topdown' +data_root = 'data/fld/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +test_pipeline = val_pipeline + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + subset='full', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_full_train.json', + data_prefix=dict(img='img/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='full', + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/fld_full_val.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='full', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_full_test.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=test_pipeline, + )) + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py new file mode 100644 index 0000000..8723ab7 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py @@ -0,0 +1,140 @@ +_base_ = '../../../_base_/default_runtime.py' + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=64) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + 
type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=4, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashionDataset' +data_mode = 'topdown' +data_root = 'data/fld/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +test_pipeline = val_pipeline + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + subset='lower', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_lower_train.json', + data_prefix=dict(img='img/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='lower', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_lower_val.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + 
num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='lower', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_lower_test.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=test_pipeline, + )) + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py new file mode 100644 index 0000000..9541987 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py @@ -0,0 +1,140 @@ +_base_ = '../../../_base_/default_runtime.py' + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=64) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + 
type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=6, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashionDataset' +data_mode = 'topdown' +data_root = 'data/fld/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +test_pipeline = val_pipeline + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + subset='upper', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_upper_train.json', + data_prefix=dict(img='img/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='upper', + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_upper_val.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + subset='upper', + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/fld_upper_test.json', + data_prefix=dict(img='img/'), + test_mode=True, + pipeline=test_pipeline, + )) + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfashion2.md b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfashion2.md new file mode 100644 index 0000000..c19eced --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfashion2.md @@ -0,0 +1,67 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+DeepFashion2 (CVPR'2019) + +```bibtex +@article{DeepFashion2, + author = {Yuying Ge and Ruimao Zhang and Lingyun Wu and Xiaogang Wang and Xiaoou Tang and Ping Luo}, + title={A Versatile Benchmark for Detection, Pose Estimation, Segmentation and Re-Identification of Clothing Images}, + journal={CVPR}, + year={2019} +} +``` + +
+ +Results on DeepFashion2 val set + +| Set | Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :-------------------- | :-------------------------------------------------: | :--------: | :-----: | :---: | :--: | :-------------------------------------------------: | :-------------------------------------------------: | +| short_sleeved_shirt | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py) | 256x192 | 0.988 | 0.703 | 10.2 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_shirt_256x192-21e1c5da_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_shirt_256x192_20221208.log.json) | +| long_sleeved_shirt | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py) | 256x192 | 0.973 | 0.587 | 16.6 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_shirt_256x192-8679e7e3_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_shirt_256x192_20221208.log.json) | +| short_sleeved_outwear | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py) | 256x192 | 0.966 | 0.408 | 24.0 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_outwear_256x192-a04c1298_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_outwear_256x192_20221208.log.json) | +| long_sleeved_outwear | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py) | 256x192 | 0.987 | 0.517 | 18.1 | 
[ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_outwear_256x192-31fbaecf_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_outwear_256x192_20221208.log.json) | +| vest | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py) | 256x192 | 0.981 | 0.643 | 12.7 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_256x192-4c48d05c_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_256x192_20221208.log.json) | +| sling | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py) | 256x192 | 0.940 | 0.557 | 21.6 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_256x192-ebb2b736_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_256x192_20221208.log.json) | +| shorts | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py) | 256x192 | 0.975 | 0.682 | 12.4 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_shorts_256x192-9ab23592_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_shorts_256x192_20221208.log.json) | +| trousers | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py) | 256x192 | 0.973 | 0.625 | 14.8 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_trousers_256x192-3e632257_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_trousers_256x192_20221208.log.json) | +| skirt | 
[pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py) | 256x192 | 0.952 | 0.653 | 16.6 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_skirt_256x192-09573469_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_skirt_256x192_20221208.log.json) | +| short_sleeved_dress | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py) | 256x192 | 0.980 | 0.603 | 15.6 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_dress_256x192-1345b07a_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_dress_256x192_20221208.log.json) | +| long_sleeved_dress | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py) | 256x192 | 0.976 | 0.518 | 20.1 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_dress_256x192-87bac74e_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_dress_256x192_20221208.log.json) | +| vest_dress | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py) | 256x192 | 0.980 | 0.600 | 16.0 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_dress_256x192-fb3fbd6f_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_dress_256x192_20221208.log.json) | +| sling_dress | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py) | 256x192 | 0.967 | 0.544 | 19.5 | 
[ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_dress_256x192-8ebae0eb_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_dress_256x192_20221208.log.json) | diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfasion2.yml b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfasion2.yml new file mode 100644 index 0000000..61b8a65 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfasion2.yml @@ -0,0 +1,185 @@ +Models: +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + - ResNet + Training Data: DeepFashion2 + Name: td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.703 + EPE: 10.2 + PCK@0.2: 0.988 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_shirt_256x192-21e1c5da_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.587 + EPE: 16.5 + PCK@0.2: 0.973 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_shirt_256x192-8679e7e3_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: 
DeepFashion2 + Name: td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.408 + EPE: 24.0 + PCK@0.2: 0.966 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_outwear_256x192-a04c1298_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.517 + EPE: 18.1 + PCK@0.2: 0.987 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_outwear_256x192-31fbaecf_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_4xb64-210e_deepfasion2-vest-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.643 + EPE: 12.7 + PCK@0.2: 0.981 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_256x192-4c48d05c_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_4xb64-210e_deepfasion2-sling-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.557 + EPE: 21.6 + PCK@0.2: 0.94 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_256x192-ebb2b736_20221208.pth +- Config: 
configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.682 + EPE: 12.4 + PCK@0.2: 0.975 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_shorts_256x192-9ab23592_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.625 + EPE: 14.8 + PCK@0.2: 0.973 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_trousers_256x192-3e632257_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.653 + EPE: 16.6 + PCK@0.2: 0.952 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_skirt_256x192-09573469_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.603 + EPE: 15.6 + PCK@0.2: 0.98 + Task: Fashion 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_dress_256x192-1345b07a_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.518 + EPE: 20.1 + PCK@0.2: 0.976 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_dress_256x192-87bac74e_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.6 + EPE: 16.0 + PCK@0.2: 0.98 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_dress_256x192-fb3fbd6f_20221208.pth +- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: DeepFashion2 + Name: td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192 + Results: + - Dataset: DeepFashion2 + Metrics: + AUC: 0.544 + EPE: 19.5 + PCK@0.2: 0.967 + Task: Fashion 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_dress_256x192-8ebae0eb_20221208.pth diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py 
b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py new file mode 100644 index 0000000..437b9aa --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=64) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_long_sleeved_dress_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_long_sleeved_dress_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py new file mode 100644 index 0000000..3b8ec62 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + 
+# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=64) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_skirt_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_skirt_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py new file mode 100644 index 0000000..1883314 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=64) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', 
input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_vest_dress_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_vest_dress_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + 
pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py new file mode 100644 index 0000000..a5f6637 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=128) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + 
flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_trousers_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_trousers_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py new file mode 100644 index 0000000..0a00361 --- /dev/null +++ 
b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=192) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_shorts_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_shorts_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py new file mode 100644 index 0000000..d865565 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + 
end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_short_sleeved_dress_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) 
+val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_short_sleeved_dress_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py new file mode 100644 index 0000000..eb42c72 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', 
+ mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_sling_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_sling_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] 
+test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py new file mode 100644 index 0000000..8d206f3 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 
'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_sling_dress_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_sling_dress_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py new file mode 100644 index 0000000..c0ed06d --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py @@ -0,0 +1,122 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders 
+train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_vest_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_vest_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py new file mode 100644 index 0000000..e1bbbe2 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = 
dict(base_batch_size=384) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_short_sleeved_shirt_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, 
round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_short_sleeved_shirt_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py new file mode 100644 index 0000000..2b36f62 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py @@ -0,0 +1,123 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + 
type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_long_sleeved_outwear_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/' + 'deepfashion2_long_sleeved_outwear_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py new file mode 100644 index 0000000..8d25b31 --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# 
pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_long_sleeved_shirt_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/deepfashion2_long_sleeved_shirt_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py new file mode 100644 index 0000000..9e381df --- /dev/null +++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py @@ -0,0 +1,123 @@ 
+_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=294, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'DeepFashion2Dataset' +data_mode = 'topdown' +data_root = 'data/deepfasion2/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data 
loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='train/deepfashion2_short_sleeved_outwear_train.json', + data_prefix=dict(img='train/image/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='validation/' + 'deepfashion2_short_sleeved_outwear_validation.json', + data_prefix=dict(img='validation/image/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/README.md b/modules/rtmpose/configs/hand_2d_keypoint/README.md new file mode 100644 index 0000000..29b9066 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/README.md @@ -0,0 +1,18 @@ +# 2D Hand Pose Estimation + +2D hand pose estimation is defined as the task of detecting the poses (or keypoints) of the hand from an input image. + +Normally, the input images are cropped hand images, where the hand locates at the center; +or the rough location (or the bounding box) of the hand is provided. + +## Data preparation + +Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_hand_keypoint.md) to prepare data. + +## Demo + +Please follow [Demo](/demo/docs/en/2d_hand_demo.md) to run demos. + +
+ +
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000..2da6481 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/README.md @@ -0,0 +1,16 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. +In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. 
+ +## Results and Models + +### COCO-WholeBody-Hand Dataset + +Results on COCO-WholeBody-Hand val set + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :-------: | :--------: | :-----: | :---: | :--: | :------------------------------------------------------------------------------------: | +| RTMPose-m | 256x256 | 0.815 | 0.837 | 4.51 | [rtmpose_coco_wholebody_hand.md](./coco_wholebody_hand/rtmpose_coco_wholebody_hand.md) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000..2199e09 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + 
data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=21, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', 
p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks 
+default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.md new file mode 100644 index 0000000..edf0819 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.md @@ -0,0 +1,39 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO-WholeBody-Hand (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Hand val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [rtmpose_m](/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.815 | 0.837 | 4.51 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-hand_pt-aic-coco_210e-256x256-99477206_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-hand_pt-aic-coco_210e-256x256-99477206_20230228.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.yml new file mode 100644 index 0000000..2fb0378 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py + In Collection: RTMPose + Metadata: + Architecture: + - RTMPose + Training Data: COCO-WholeBody-Hand + Name: rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256 + Results: + - Dataset: COCO-WholeBody-Hand + Metrics: + AUC: 0.837 + EPE: 4.51 + PCK@0.2: 0.815 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-hand_pt-aic-coco_210e-256x256-99477206_20230228.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose-m_8xb256-210e_hand5-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose-m_8xb256-210e_hand5-256x256.py new file mode 100644 index 0000000..96b839f ---
/dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose-m_8xb256-210e_hand5-256x256.py @@ -0,0 +1,380 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# coco-hand onehand10k freihand2d rhd2d halpehand + +# runtime +max_epochs = 210 +stage2_num_epochs = 10 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=21, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in 
codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.2), + 
dict(type='MedianBlur', p=0.2), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_onehand10k = dict( + type='OneHand10KDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='onehand10k/annotations/onehand10k_train.json', + data_prefix=dict(img='pose/OneHand10K/'), + pipeline=[], +) + +dataset_freihand = dict( + type='FreiHandDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='freihand/annotations/freihand_train.json', + data_prefix=dict(img='pose/FreiHand/'), + pipeline=[], +) + +dataset_rhd = dict( + type='Rhd2DDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='rhd/annotations/rhd_train.json', + data_prefix=dict(img='pose/RHD/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=21, + mapping=[ + (0, 0), + (1, 4), + (2, 3), + (3, 2), + (4, 1), + (5, 8), + (6, 7), + (7, 6), + (8, 5), + (9, 12), + (10, 11), + (11, 10), + (12, 9), + (13, 16), + (14, 15), + (15, 14), + (16, 13), + (17, 20), + (18, 19), + (19, 18), + (20, 17), + ]) + ], +) + +dataset_halpehand = dict( + type='HalpeHandDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015/'), + pipeline=[], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict( + from_file='configs/_base_/datasets/coco_wholebody_hand.py'), + datasets=[ + dataset_coco, 
dataset_onehand10k, dataset_freihand, dataset_rhd, + dataset_halpehand + ], + pipeline=train_pipeline, + test_mode=False, + )) + +# test datasets +val_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[], +) + +val_onehand10k = dict( + type='OneHand10KDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='onehand10k/annotations/onehand10k_test.json', + data_prefix=dict(img='pose/OneHand10K/'), + pipeline=[], +) + +val_freihand = dict( + type='FreiHandDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='freihand/annotations/freihand_test.json', + data_prefix=dict(img='pose/FreiHand/'), + pipeline=[], +) + +val_rhd = dict( + type='Rhd2DDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='rhd/annotations/rhd_test.json', + data_prefix=dict(img='pose/RHD/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=21, + mapping=[ + (0, 0), + (1, 4), + (2, 3), + (3, 2), + (4, 1), + (5, 8), + (6, 7), + (7, 6), + (8, 5), + (9, 12), + (10, 11), + (11, 10), + (12, 9), + (13, 16), + (14, 15), + (15, 14), + (16, 13), + (17, 20), + (18, 19), + (19, 18), + (20, 17), + ]) + ], +) + +val_halpehand = dict( + type='HalpeHandDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_val_v1.json', + data_prefix=dict(img='detection/coco/val2017/'), + pipeline=[], +) + +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CombinedDataset', + metainfo=dict( + from_file='configs/_base_/datasets/coco_wholebody_hand.py'), + datasets=[ + val_coco, val_onehand10k, val_freihand, val_rhd, val_halpehand + ], + pipeline=val_pipeline, + test_mode=True, + )) + +val_dataloader = test_dataloader + +# hooks +default_hooks = 
dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.md b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.md new file mode 100644 index 0000000..5da110e --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.md @@ -0,0 +1,67 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +- `Hand5` and `*` denote model trained on 5 public datasets: + - [COCO-Wholebody-Hand](https://github.com/jin-s13/COCO-WholeBody/) + - [OneHand10K](https://www.yangangwang.com/papers/WANG-MCC-2018-10.html) + - [FreiHand2d](https://lmb.informatik.uni-freiburg.de/projects/freihand/) + - [RHD2d](https://lmb.informatik.uni-freiburg.de/resources/datasets/RenderedHandposeDataset.en.html) + - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe) + +| Config | Input Size | PCK@0.2
(COCO-Wholebody-Hand) | PCK@0.2
(Hand5) | AUC
(Hand5) | EPE
(Hand5) | FLOPS(G) | Download | +| :---------------------------------------: | :--------: | :-----------------------------------: | :---------------------: | :-----------------: | :-----------------: | :------: | :-----------------------------------------: | +| [RTMPose-m\*
(alpha version)](./rtmpose/hand_2d_keypoint/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 81.5 | 96.4 | 83.9 | 5.06 | 2.581 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-hand5_pt-aic-coco_210e-256x256-74fb594_20230320.pth) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.yml b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.yml new file mode 100644 index 0000000..a57b36c --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.yml @@ -0,0 +1,28 @@ +Collections: +- Name: RTMPose + Paper: + Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose" + URL: https://arxiv.org/abs/2303.07399 + README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md +Models: +- Config: configs/hand_2d_keypoint/rtmpose/hand5/rtmpose-m_8xb256-210e_hand5-256x256.py + In Collection: RTMPose + Alias: hand + Metadata: + Architecture: &id001 + - RTMPose + Training Data: &id002 + - COCO-Wholebody-Hand + - OneHand10K + - FreiHand2d + - RHD2d + - Halpe + Name: rtmpose-m_8xb256-210e_hand5-256x256 + Results: + - Dataset: Hand5 + Metrics: + PCK@0.2: 0.964 + AUC: 0.839 + EPE: 5.06 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-hand5_pt-aic-coco_210e-256x256-74fb594_20230320.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/README.md new file mode 100644 index 0000000..969482a --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/README.md @@ -0,0 +1,55 @@ +# Top-down heatmap-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. 
Instead of estimating keypoint coordinates directly, the pose estimator will produce heatmaps which represent the likelihood of being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html). + +
+ +
+ +## Results and Models + +### COCO-WholeBody-Hand Dataset + +Results on COCO-WholeBody-Hand val set + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :--------------: | :--------: | :-----: | :---: | :--: | :----------------------------------------------------------------------------------------------: | +| HRNetv2-w18+Dark | 256x256 | 0.814 | 0.840 | 4.37 | [hrnetv2_dark_coco_wholebody_hand.md](./coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.md) | +| HRNetv2-w18 | 256x256 | 0.813 | 0.840 | 4.39 | [hrnetv2_coco_wholebody_hand.md](./coco_wholebody_hand/hrnetv2_coco_wholebody_hand.md) | +| HourglassNet | 256x256 | 0.804 | 0.835 | 4.54 | [hourglass_coco_wholebody_hand.md](./coco_wholebody_hand/hourglass_coco_wholebody_hand.md) | +| SCNet-50 | 256x256 | 0.803 | 0.834 | 4.55 | [scnet_coco_wholebody_hand.md](./coco_wholebody_hand/scnet_coco_wholebody_hand.md) | +| ResNet-50 | 256x256 | 0.800 | 0.833 | 4.64 | [resnet_coco_wholebody_hand.md](./coco_wholebody_hand/resnet_coco_wholebody_hand.md) | +| LiteHRNet-18 | 256x256 | 0.795 | 0.830 | 4.77 | [litehrnet_coco_wholebody_hand.md](./coco_wholebody_hand/litehrnet_coco_wholebody_hand.md) | +| MobileNet-v2 | 256x256 | 0.795 | 0.829 | 4.77 | [mobilenetv2_coco_wholebody_hand.md](./coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.md) | + +### FreiHand Dataset + +Results on FreiHand val & test set + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :-------: | :--------: | :-----: | :---: | :--: | :-------------------------------------------------------: | +| ResNet-50 | 224x224 | 0.999 | 0.868 | 3.27 | [resnet_freihand2d.md](./freihand2d/resnet_freihand2d.md) | + +### OneHand10K Dataset + +Results on OneHand10K val set + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :--------------: | :--------: | :-----: | :---: | :---: | :-------------------------------------------------------------------: | +| HRNetv2-w18+Dark | 256x256 | 0.990 | 0.572 | 23.96 
| [hrnetv2_dark_onehand10k.md](./onehand10k/hrnetv2_dark_onehand10k.md) | +| HRNetv2-w18+UDP | 256x256 | 0.990 | 0.571 | 23.88 | [hrnetv2_udp_onehand10k.md](./onehand10k/hrnetv2_udp_onehand10k.md) | +| HRNetv2-w18 | 256x256 | 0.990 | 0.567 | 24.26 | [hrnetv2_onehand10k.md](./onehand10k/hrnetv2_onehand10k.md) | +| ResNet-50 | 256x256 | 0.989 | 0.555 | 25.16 | [resnet_onehand10k.md](./onehand10k/resnet_onehand10k.md) | +| MobileNet-v2 | 256x256 | 0.986 | 0.537 | 28.56 | [mobilenetv2_onehand10k.md](./onehand10k/mobilenetv2_onehand10k.md) | + +### RHD Dataset + +Results on RHD test set + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :--------------: | :--------: | :-----: | :---: | :--: | :----------------------------------------------------: | +| HRNetv2-w18+Dark | 256x256 | 0.992 | 0.903 | 2.18 | [hrnetv2_dark_rhd2d.md](./rhd2d/hrnetv2_dark_rhd2d.md) | +| HRNetv2-w18+UDP | 256x256 | 0.992 | 0.902 | 2.19 | [hrnetv2_udp_rhd2d.md](./rhd2d/hrnetv2_udp_rhd2d.md) | +| HRNetv2-w18 | 256x256 | 0.992 | 0.902 | 2.21 | [hrnetv2_rhd2d.md](./rhd2d/hrnetv2_rhd2d.md) | +| ResNet-50 | 256x256 | 0.991 | 0.898 | 2.32 | [resnet_rhd2d.md](./rhd2d/resnet_rhd2d.md) | +| MobileNet-v2 | 256x256 | 0.985 | 0.883 | 2.79 | [mobilenetv2_rhd2d.md](./rhd2d/mobilenetv2_rhd2d.md) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.md new file mode 100644 index 0000000..2926593 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.md @@ -0,0 +1,39 @@ + + +
+Hourglass (ECCV'2016) + +```bibtex +@inproceedings{newell2016stacked, + title={Stacked hourglass networks for human pose estimation}, + author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia}, + booktitle={European conference on computer vision}, + pages={483--499}, + year={2016}, + organization={Springer} +} +``` + +
+ + + +
+COCO-WholeBody-Hand (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Hand val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_hourglass_52](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.804 | 0.835 | 4.54 | [ckpt](https://download.openmmlab.com/mmpose/hand/hourglass/hourglass52_coco_wholebody_hand_256x256-7b05c6db_20210909.pth) | [log](https://download.openmmlab.com/mmpose/hand/hourglass/hourglass52_coco_wholebody_hand_256x256_20210909.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.yml new file mode 100644 index 0000000..21ff3f0 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py + In Collection: Hourglass + Metadata: + Architecture: + - Hourglass + Training Data: COCO-WholeBody-Hand + Name: td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256 + Results: + - Dataset: COCO-WholeBody-Hand + Metrics: + AUC: 0.835 + EPE: 4.54 + PCK@0.2: 0.804 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/hourglass/hourglass52_coco_wholebody_hand_256x256-7b05c6db_20210909.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.md new file mode 100644 index 
0000000..eae4dce --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.md @@ -0,0 +1,39 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+COCO-WholeBody-Hand (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Hand val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_hrnetv2_w18](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.813 | 0.840 | 4.39 | [ckpt](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth) | [log](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_coco_wholebody_hand_256x256_20210908.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.yml new file mode 100644 index 0000000..0190ac9 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + Training Data: COCO-WholeBody-Hand + Name: td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256 + Results: + - Dataset: COCO-WholeBody-Hand + Metrics: + AUC: 0.84 + EPE: 4.39 + PCK@0.2: 0.813 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.md new file mode 100644 index 
0000000..718baa7 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.md @@ -0,0 +1,56 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+COCO-WholeBody-Hand (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Hand val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_hrnetv2_w18_dark](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.814 | 0.840 | 4.37 | [ckpt](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_coco_wholebody_hand_256x256_dark-a9228c9c_20210908.pth) | [log](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_coco_wholebody_hand_256x256_dark_20210908.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.yml new file mode 100644 index 0000000..f5b275a --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py + In Collection: DarkPose + Metadata: + Architecture: + - HRNetv2 + - DarkPose + Training Data: COCO-WholeBody-Hand + Name: td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256 + Results: + - Dataset: COCO-WholeBody-Hand + Metrics: + AUC: 0.84 + EPE: 4.37 + PCK@0.2: 0.814 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_coco_wholebody_hand_256x256_dark-a9228c9c_20210908.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.md 
b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.md new file mode 100644 index 0000000..1508d86 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.md @@ -0,0 +1,37 @@ + + +
+LiteHRNet (CVPR'2021) + +```bibtex +@inproceedings{Yulitehrnet21, + title={Lite-HRNet: A Lightweight High-Resolution Network}, + author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong}, + booktitle={CVPR}, + year={2021} +} +``` + +
+ + + +
+COCO-WholeBody-Hand (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Hand val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [LiteHRNet-18](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.795 | 0.830 | 4.77 | [ckpt](https://download.openmmlab.com/mmpose/hand/litehrnet/litehrnet_w18_coco_wholebody_hand_256x256-d6945e6a_20210908.pth) | [log](https://download.openmmlab.com/mmpose/hand/litehrnet/litehrnet_w18_coco_wholebody_hand_256x256_20210908.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.yml new file mode 100644 index 0000000..66c5713 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py + In Collection: LiteHRNet + Metadata: + Architecture: + - LiteHRNet + Training Data: COCO-WholeBody-Hand + Name: td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256 + Results: + - Dataset: COCO-WholeBody-Hand + Metrics: + AUC: 0.83 + EPE: 4.77 + PCK@0.2: 0.795 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/litehrnet/litehrnet_w18_coco_wholebody_hand_256x256-d6945e6a_20210908.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.md new file mode 
100644 index 0000000..6b65bd0 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.md @@ -0,0 +1,38 @@ + + +
+MobilenetV2 (CVPR'2018) + +```bibtex +@inproceedings{sandler2018mobilenetv2, + title={Mobilenetv2: Inverted residuals and linear bottlenecks}, + author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4510--4520}, + year={2018} +} +``` + +
+ + + +
+COCO-WholeBody-Hand (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Hand val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------: | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_mobilenetv2](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.795 | 0.829 | 4.77 | [ckpt](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_coco_wholebody_hand_256x256-06b8c877_20210909.pth) | [log](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_coco_wholebody_hand_256x256_20210909.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.yml new file mode 100644 index 0000000..cc8c5a2 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - MobilenetV2 + Training Data: COCO-WholeBody-Hand + Name: td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256 + Results: + - Dataset: COCO-WholeBody-Hand + Metrics: + AUC: 0.829 + EPE: 4.77 + PCK@0.2: 0.795 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_coco_wholebody_hand_256x256-06b8c877_20210909.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.md 
b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.md new file mode 100644 index 0000000..21693f1 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.md @@ -0,0 +1,55 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+COCO-WholeBody-Hand (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Hand val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------: | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_resnet_50](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.800 | 0.833 | 4.64 | [ckpt](https://download.openmmlab.com/mmpose/hand/resnet/res50_coco_wholebody_hand_256x256-8dbc750c_20210908.pth) | [log](https://download.openmmlab.com/mmpose/hand/resnet/res50_coco_wholebody_hand_256x256_20210908.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.yml new file mode 100644 index 0000000..b663c5d --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - ResNet + Training Data: COCO-WholeBody-Hand + Name: td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256 + Results: + - Dataset: COCO-WholeBody-Hand + Metrics: + AUC: 0.833 + EPE: 4.64 + PCK@0.2: 0.8 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/resnet/res50_coco_wholebody_hand_256x256-8dbc750c_20210908.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.md new file mode 100644 index 0000000..1cf44e2 --- /dev/null +++ 
b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.md @@ -0,0 +1,38 @@ + + +
+SCNet (CVPR'2020) + +```bibtex +@inproceedings{liu2020improving, + title={Improving Convolutional Networks with Self-Calibrated Convolutions}, + author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={10096--10105}, + year={2020} +} +``` + +
+ + + +
+COCO-WholeBody-Hand (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Hand val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------: | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_scnet_50](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.803 | 0.834 | 4.55 | [ckpt](https://download.openmmlab.com/mmpose/hand/scnet/scnet50_coco_wholebody_hand_256x256-e73414c7_20210909.pth) | [log](https://download.openmmlab.com/mmpose/hand/scnet/scnet50_coco_wholebody_hand_256x256_20210909.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.yml new file mode 100644 index 0000000..0fd05eb --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SCNet + Training Data: COCO-WholeBody-Hand + Name: td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256 + Results: + - Dataset: COCO-WholeBody-Hand + Metrics: + AUC: 0.834 + EPE: 4.55 + PCK@0.2: 0.803 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/scnet/scnet50_coco_wholebody_hand_256x256-e73414c7_20210909.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 
100644 index 0000000..05b1ad1 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,123 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HourglassNet', + num_stacks=1, + ), + head=dict( + type='CPMHead', + in_channels=256, + out_channels=21, + num_stages=1, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + rotate_factor=180.0, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] 
+val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000..be8d278 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + 
milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=21, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + 
scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000..1c0f1c3 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,158 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# 
learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=21, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# 
pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000..c160687 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,136 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='LiteHRNet', + in_channels=3, + extra=dict( + stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + num_stages=3, + stages_spec=dict( + num_modules=(2, 4, 2), + num_branches=(2, 3, 4), + num_blocks=(2, 2, 2), + module_type=('LITE', 'LITE', 'LITE'), + with_fuse=(True, True, True), + reduce_ratios=(8, 8, 8), + num_channels=( + (40, 80), + (40, 80, 160), + (40, 80, 160, 320), + )), + with_head=True, + )), + head=dict( + type='HeatmapHead', + in_channels=40, + out_channels=21, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='RandomFlip', 
direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000..e68449f --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,120 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy 
+param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MobileNetV2', + widen_factor=1., + out_indices=(7, ), + init_cfg=dict(type='Pretrained', checkpoint='mmcls://mobilenet_v2')), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=21, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000..e7b9ff6 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,119 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = 
dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=21, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = 
val_dataloader + +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000..f65ea47 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='SCNet', + depth=50, + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/scnet50-7ef0a199.pth')), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=21, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + 
shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.md new file mode 100644 index 0000000..33a57aa --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.md @@ -0,0 +1,56 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+FreiHand (ICCV'2019) + +```bibtex +@inproceedings{zimmermann2019freihand, + title={Freihand: A dataset for markerless capture of hand pose and shape from single rgb images}, + author={Zimmermann, Christian and Ceylan, Duygu and Yang, Jimei and Russell, Bryan and Argus, Max and Brox, Thomas}, + booktitle={Proceedings of the IEEE International Conference on Computer Vision}, + pages={813--822}, + year={2019} +} +``` + +
+ +Results on FreiHand val & test set + +| Set | Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--- | :-------------------------------------------------------: | :--------: | :-----: | :---: | :--: | :-------------------------------------------------------: | :------------------------------------------------------: | +| test | [pose_resnet_50](/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py) | 224x224 | 0.999 | 0.868 | 3.27 | [ckpt](https://download.openmmlab.com/mmpose/hand/resnet/res50_freihand_224x224-ff0799bc_20200914.pth) | [log](https://download.openmmlab.com/mmpose/hand/resnet/res50_freihand_224x224_20200914.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.yml new file mode 100644 index 0000000..925f440 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - ResNet + Training Data: FreiHand + Name: td-hm_res50_8xb64-100e_freihand2d-224x224 + Results: + - Dataset: FreiHand + Metrics: + AUC: 0.868 + EPE: 3.27 + PCK@0.2: 0.999 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/resnet/res50_freihand_224x224-ff0799bc_20200914.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py new file mode 100644 index 0000000..677ca31 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py @@ -0,0 +1,138 
@@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=100, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=100, + milestones=[50, 70], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', interval=1)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(224, 224), heatmap_size=(56, 56), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=21, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'FreiHandDataset' +data_mode = 'topdown' +data_root = 'data/freihand/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', + shift_factor=0.25, + rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale', padding=0.8), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/freihand_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/freihand_val.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/freihand_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.md new file mode 100644 index 0000000..88fb8e4 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.md @@ -0,0 +1,60 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+OneHand10K (TCSVT'2019) + +```bibtex +@article{wang2018mask, + title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image}, + author={Wang, Yangang and Peng, Cong and Liu, Yebin}, + journal={IEEE Transactions on Circuits and Systems for Video Technology}, + volume={29}, + number={11}, + pages={3258--3268}, + year={2018}, + publisher={IEEE} +} +``` + +
+ +Results on OneHand10K val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: | +| [pose_hrnetv2_w18_dark](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.990 | 0.572 | 23.96 | [ckpt](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_onehand10k_256x256_dark-a2f80c64_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_onehand10k_256x256_dark_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.yml new file mode 100644 index 0000000..d02795c --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py + In Collection: DarkPose + Metadata: + Architecture: + - HRNetv2 + - DarkPose + Training Data: OneHand10K + Name: td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256 + Results: + - Dataset: OneHand10K + Metrics: + AUC: 0.572 + EPE: 23.96 + PCK@0.2: 0.99 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_onehand10k_256x256_dark-a2f80c64_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.md new file mode 100644 index 0000000..41bed70 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.md @@ -0,0 +1,43 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+OneHand10K (TCSVT'2019) + +```bibtex +@article{wang2018mask, + title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image}, + author={Wang, Yangang and Peng, Cong and Liu, Yebin}, + journal={IEEE Transactions on Circuits and Systems for Video Technology}, + volume={29}, + number={11}, + pages={3258--3268}, + year={2018}, + publisher={IEEE} +} +``` + +
+ +Results on OneHand10K val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: | +| [pose_hrnetv2_w18](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.990 | 0.567 | 24.26 | [ckpt](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_onehand10k_256x256-30bc9c6b_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_onehand10k_256x256_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.yml new file mode 100644 index 0000000..f5ee14c --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + Training Data: OneHand10K + Name: td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256 + Results: + - Dataset: OneHand10K + Metrics: + AUC: 0.567 + EPE: 24.26 + PCK@0.2: 0.99 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_onehand10k_256x256-30bc9c6b_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.md new file mode 100644 index 0000000..0507035 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.md @@ -0,0 +1,60 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+OneHand10K (TCSVT'2019) + +```bibtex +@article{wang2018mask, + title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image}, + author={Wang, Yangang and Peng, Cong and Liu, Yebin}, + journal={IEEE Transactions on Circuits and Systems for Video Technology}, + volume={29}, + number={11}, + pages={3258--3268}, + year={2018}, + publisher={IEEE} +} +``` + +
+ +Results on OneHand10K val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: | +| [pose_hrnetv2_w18_udp](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.990 | 0.571 | 23.88 | [ckpt](https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_onehand10k_256x256_udp-0d1b515d_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_onehand10k_256x256_udp_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.yml new file mode 100644 index 0000000..903f047 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py + In Collection: UDP + Metadata: + Architecture: + - HRNetv2 + - UDP + Training Data: OneHand10K + Name: td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256 + Results: + - Dataset: OneHand10K + Metrics: + AUC: 0.571 + EPE: 23.88 + PCK@0.2: 0.99 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_onehand10k_256x256_udp-0d1b515d_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.md new file mode 100644 index 0000000..b89b1d1 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.md @@ -0,0 +1,42 @@ + + +
+MobilenetV2 (CVPR'2018) + +```bibtex +@inproceedings{sandler2018mobilenetv2, + title={Mobilenetv2: Inverted residuals and linear bottlenecks}, + author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4510--4520}, + year={2018} +} +``` + +
+ + + +
+OneHand10K (TCSVT'2019) + +```bibtex +@article{wang2018mask, + title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image}, + author={Wang, Yangang and Peng, Cong and Liu, Yebin}, + journal={IEEE Transactions on Circuits and Systems for Video Technology}, + volume={29}, + number={11}, + pages={3258--3268}, + year={2018}, + publisher={IEEE} +} +``` + +
+ +Results on OneHand10K val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: | +| [pose_mobilenet_v2](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.986 | 0.537 | 28.56 | [ckpt](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_onehand10k_256x256-f3a3d90e_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_onehand10k_256x256_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.yml new file mode 100644 index 0000000..4090189 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - MobilenetV2 + Training Data: OneHand10K + Name: td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256 + Results: + - Dataset: OneHand10K + Metrics: + AUC: 0.537 + EPE: 28.56 + PCK@0.2: 0.986 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_onehand10k_256x256-f3a3d90e_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.md new file mode 100644 index 0000000..2498536 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.md @@ -0,0 +1,59 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+OneHand10K (TCSVT'2019) + +```bibtex +@article{wang2018mask, + title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image}, + author={Wang, Yangang and Peng, Cong and Liu, Yebin}, + journal={IEEE Transactions on Circuits and Systems for Video Technology}, + volume={29}, + number={11}, + pages={3258--3268}, + year={2018}, + publisher={IEEE} +} +``` + +
+ +Results on OneHand10K val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: | +| [pose_resnet_50](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py) | 256x256 | 0.989 | 0.555 | 25.16 | [ckpt](https://download.openmmlab.com/mmpose/hand/resnet/res50_onehand10k_256x256-739c8639_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/resnet/res50_onehand10k_256x256_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml new file mode 100644 index 0000000..f30171d --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - ResNet + Training Data: OneHand10K + Name: td-hm_res50_8xb32-210e_onehand10k-256x256 + Results: + - Dataset: OneHand10K + Metrics: + AUC: 0.555 + EPE: 25.16 + PCK@0.2: 0.989 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/resnet/res50_onehand10k_256x256-739c8639_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py new file mode 100644 index 0000000..499a11a --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py @@ 
-0,0 +1,158 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://msra/hrnetv2_w18', + )), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=21, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + 
test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'OneHand10KDataset' +data_mode = 'topdown' +data_root = 'data/onehand10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py new file mode 100644 index 0000000..08b6588 --- /dev/null +++ 
b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py @@ -0,0 +1,162 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://msra/hrnetv2_w18', + )), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=21, + 
deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'OneHand10KDataset' +data_mode = 'topdown' +data_root = 'data/onehand10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py 
b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py new file mode 100644 index 0000000..0dd9402 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py @@ -0,0 +1,158 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://msra/hrnetv2_w18', + 
)), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=21, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'OneHand10KDataset' +data_mode = 'topdown' +data_root = 'data/onehand10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py new file mode 100644 index 0000000..63f6af3 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MobileNetV2', + widen_factor=1., + out_indices=(7, ), + init_cfg=dict( + type='Pretrained', + checkpoint='mmcls://mobilenet_v2', + )), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=21, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'OneHand10KDataset' +data_mode = 'topdown' +data_root = 'data/onehand10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py new file mode 100644 index 0000000..11b549c --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + 
type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict( + type='Pretrained', + checkpoint='torchvision://resnet50', + )), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=21, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'OneHand10KDataset' +data_mode = 'topdown' +data_root = 'data/onehand10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + 
type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.md new file mode 100644 index 0000000..2fc7d85 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.md @@ -0,0 +1,58 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+RHD (ICCV'2017) + +```bibtex +@TechReport{zb2017hand, + author={Christian Zimmermann and Thomas Brox}, + title={Learning to Estimate 3D Hand Pose from Single RGB Images}, + institution={arXiv:1705.01389}, + year={2017}, + note="https://arxiv.org/abs/1705.01389", + url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/" +} +``` + +
+ +Results on RHD test set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_hrnetv2_w18_dark](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.992 | 0.903 | 2.18 | [ckpt](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_rhd2d_256x256_dark-4df3a347_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_rhd2d_256x256_dark_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.yml new file mode 100644 index 0000000..9dde35d --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py + In Collection: DarkPose + Metadata: + Architecture: + - HRNetv2 + - DarkPose + Training Data: RHD + Name: td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256 + Results: + - Dataset: RHD + Metrics: + AUC: 0.903 + EPE: 2.18 + PCK@0.2: 0.992 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_rhd2d_256x256_dark-4df3a347_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.md new file mode 100644 index 0000000..1703e8c --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.md @@ -0,0 +1,41 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+RHD (ICCV'2017) + +```bibtex +@TechReport{zb2017hand, + author={Christian Zimmermann and Thomas Brox}, + title={Learning to Estimate 3D Hand Pose from Single RGB Images}, + institution={arXiv:1705.01389}, + year={2017}, + note="https://arxiv.org/abs/1705.01389", + url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/" +} +``` + +
+ +Results on RHD test set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_hrnetv2_w18](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.992 | 0.902 | 2.21 | [ckpt](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_rhd2d_256x256-95b20dd8_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_rhd2d_256x256_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.yml new file mode 100644 index 0000000..8415f3c --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.yml @@ -0,0 +1,16 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py + In Collection: HRNetv2 + Metadata: + Architecture: + - HRNetv2 + Training Data: RHD + Name: td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256 + Results: + - Dataset: RHD + Metrics: + AUC: 0.902 + EPE: 2.21 + PCK@0.2: 0.992 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_rhd2d_256x256-95b20dd8_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.md new file mode 100644 index 0000000..da766c4 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.md @@ -0,0 +1,58 @@ + + +
+HRNetv2 (TPAMI'2019) + +```bibtex +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal={TPAMI}, + year={2019} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+RHD (ICCV'2017) + +```bibtex +@TechReport{zb2017hand, + author={Christian Zimmermann and Thomas Brox}, + title={Learning to Estimate 3D Hand Pose from Single RGB Images}, + institution={arXiv:1705.01389}, + year={2017}, + note="https://arxiv.org/abs/1705.01389", + url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/" +} +``` + +
+ +Results on RHD test set + +| Arch | Input Size | PCKh@0.7 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :------: | :---: | :--: | :--------------------------------------------------------: | :-------------------------------------------------------: | +| [pose_hrnetv2_w18_udp](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.992 | 0.902 | 2.19 | [ckpt](https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_rhd2d_256x256_udp-63ba6007_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_rhd2d_256x256_udp_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.yml new file mode 100644 index 0000000..148da23 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py + In Collection: UDP + Metadata: + Architecture: + - HRNetv2 + - UDP + Training Data: RHD + Name: td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256 + Results: + - Dataset: RHD + Metrics: + AUC: 0.902 + EPE: 2.19 + PCKh@0.7: 0.992 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_rhd2d_256x256_udp-63ba6007_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.md new file mode 100644 index 0000000..19506c5 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.md @@ -0,0 +1,40 @@ + + +
+MobilenetV2 (CVPR'2018) + +```bibtex +@inproceedings{sandler2018mobilenetv2, + title={Mobilenetv2: Inverted residuals and linear bottlenecks}, + author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4510--4520}, + year={2018} +} +``` + +
+ + + +
+RHD (ICCV'2017) + +```bibtex +@TechReport{zb2017hand, + author={Christian Zimmermann and Thomas Brox}, + title={Learning to Estimate 3D Hand Pose from Single RGB Images}, + institution={arXiv:1705.01389}, + year={2017}, + note="https://arxiv.org/abs/1705.01389", + url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/" +} +``` + +
+ +Results on RHD test set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_mobilenet_v2](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.985 | 0.883 | 2.79 | [ckpt](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_rhd2d_256x256-85fa02db_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_rhd2d_256x256_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.yml new file mode 100644 index 0000000..0d1bd76 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - MobilenetV2 + Training Data: RHD + Name: td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256 + Results: + - Dataset: RHD + Metrics: + AUC: 0.883 + EPE: 2.79 + PCK@0.2: 0.985 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_rhd2d_256x256-85fa02db_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.md new file mode 100644 index 0000000..843bd75 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.md @@ -0,0 +1,57 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+RHD (ICCV'2017) + +```bibtex +@TechReport{zb2017hand, + author={Christian Zimmermann and Thomas Brox}, + title={Learning to Estimate 3D Hand Pose from Single RGB Images}, + institution={arXiv:1705.01389}, + year={2017}, + note="https://arxiv.org/abs/1705.01389", + url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/" +} +``` + +
+ +Results on RHD test set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [pose_resnet50](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.991 | 0.898 | 2.32 | [ckpt](https://download.openmmlab.com/mmpose/hand/resnet/res50_rhd2d_256x256-5dc7e4cc_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/resnet/res50_rhd2d_256x256_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.yml new file mode 100644 index 0000000..30cf36b --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: + - SimpleBaseline2D + - ResNet + Training Data: RHD + Name: td-hm_res50_8xb64-210e_rhd2d-256x256 + Results: + - Dataset: RHD + Metrics: + AUC: 0.898 + EPE: 2.32 + PCK@0.2: 0.991 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/resnet/res50_rhd2d_256x256-5dc7e4cc_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py new file mode 100644 index 0000000..e9cb89b --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py @@ -0,0 +1,158 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, 
val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://msra/hrnetv2_w18', + )), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=21, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings 
+dataset_type = 'Rhd2DDataset' +data_mode = 'topdown' +data_root = 'data/rhd/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py new file mode 100644 index 0000000..eac5595 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py @@ -0,0 +1,162 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(256, 256), + heatmap_size=(64, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://msra/hrnetv2_w18', + )), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=21, + deconv_out_channels=None, + conv_out_channels=(270, ), + conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + 
test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'Rhd2DDataset' +data_mode = 'topdown' +data_root = 'data/rhd/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py new file mode 100644 index 0000000..c2a672b --- /dev/null +++ 
b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py @@ -0,0 +1,158 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(18, 36)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(18, 36, 72)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(18, 36, 72, 144), + multiscale_output=True), + upsample=dict(mode='bilinear', align_corners=False)), + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://msra/hrnetv2_w18', + )), + neck=dict( + type='FeatureMapProcessor', + concat=True, + ), + head=dict( + type='HeatmapHead', + in_channels=270, + out_channels=21, + deconv_out_channels=None, + conv_out_channels=(270, ), 
+ conv_kernel_sizes=(1, ), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'Rhd2DDataset' +data_mode = 'topdown' +data_root = 'data/rhd/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py new file mode 100644 
index 0000000..68ee857 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py @@ -0,0 +1,125 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='MobileNetV2', + widen_factor=1., + out_indices=(7, ), + init_cfg=dict( + type='Pretrained', + checkpoint='mmcls://mobilenet_v2', + )), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=21, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'Rhd2DDataset' +data_mode = 'topdown' +data_root = 'data/rhd/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + 
dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py new file mode 100644 index 0000000..07d04dc --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py @@ -0,0 +1,124 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual 
training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict( + type='Pretrained', + checkpoint='torchvision://resnet50', + )), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=21, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'Rhd2DDataset' +data_mode = 'topdown' +data_root = 'data/rhd/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, 
round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/README.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/README.md new file mode 100644 index 0000000..2fe838b --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/README.md @@ -0,0 +1,25 @@ +# Top-down regression-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. At the 2nd stage, regression based methods directly regress the keypoint coordinates given the features extracted from the bounding box area, following the paradigm introduced in [Deeppose: Human pose estimation via deep neural networks](http://openaccess.thecvf.com/content_cvpr_2014/html/Toshev_DeepPose_Human_Pose_2014_CVPR_paper.html). + +
+ +
+ +## Results and Models + +### OneHand10K Dataset + +Results on OneHand10K val set + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :-------: | :--------: | :-----: | :---: | :---: | :-------------------------------------------------------: | +| ResNet-50 | 256x256 | 0.990 | 0.485 | 34.21 | [resnet_onehand10k.md](./onehand10k/resnet_onehand10k.md) | + +### RHD Dataset + +Results on RHD test set + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :-------: | :--------: | :-----: | :---: | :--: | :----------------------------------------: | +| ResNet-50 | 256x256 | 0.988 | 0.865 | 3.32 | [resnet_rhd2d.md](./rhd2d/resnet_rhd2d.md) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.md new file mode 100644 index 0000000..9e9e603 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.md @@ -0,0 +1,59 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+OneHand10K (TCSVT'2019) + +```bibtex +@article{wang2018mask, + title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image}, + author={Wang, Yangang and Peng, Cong and Liu, Yebin}, + journal={IEEE Transactions on Circuits and Systems for Video Technology}, + volume={29}, + number={11}, + pages={3258--3268}, + year={2018}, + publisher={IEEE} +} +``` + +
+ +Results on OneHand10K val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: | +| [deeppose_resnet_50](/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.990 | 0.485 | 34.21 | [ckpt](https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_onehand10k_256x256-cbddf43a_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_onehand10k_256x256_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.yml new file mode 100644 index 0000000..6b92014 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py + In Collection: DeepPose + Metadata: + Architecture: + - DeepPose + - ResNet + Training Data: OneHand10K + Name: td-reg_res50_8xb64-210e_onehand10k-256x256 + Results: + - Dataset: OneHand10K + Metrics: + AUC: 0.485 + EPE: 34.21 + PCK@0.2: 0.99 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_onehand10k_256x256-cbddf43a_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py new file mode 100644 index 0000000..4d33893 --- /dev/null +++ 
b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(256, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=21, + loss=dict(type='SmoothL1Loss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'OneHand10KDataset' +data_mode = 'topdown' +data_root = 'data/onehand10k/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/onehand10k_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.md new file mode 100644 index 0000000..25ae62b --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.md @@ -0,0 +1,57 @@ + + +
+DeepPose (CVPR'2014) + +```bibtex +@inproceedings{toshev2014deeppose, + title={Deeppose: Human pose estimation via deep neural networks}, + author={Toshev, Alexander and Szegedy, Christian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1653--1660}, + year={2014} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+RHD (ICCV'2017) + +```bibtex +@TechReport{zb2017hand, + author={Christian Zimmermann and Thomas Brox}, + title={Learning to Estimate 3D Hand Pose from Single RGB Images}, + institution={arXiv:1705.01389}, + year={2017}, + note="https://arxiv.org/abs/1705.01389", + url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/" +} +``` + +
+ +Results on RHD test set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [deeppose_resnet_50](/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.988 | 0.865 | 3.32 | [ckpt](https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_rhd2d_256x256-37f1c4d3_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_rhd2d_256x256_20210330.log.json) | diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.yml new file mode 100644 index 0000000..705329d --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.yml @@ -0,0 +1,17 @@ +Models: +- Config: configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py + In Collection: DeepPose + Metadata: + Architecture: + - DeepPose + - ResNet + Training Data: RHD + Name: td-reg_res50_8xb64-210e_rhd2d-256x256 + Results: + - Dataset: RHD + Metrics: + AUC: 0.865 + EPE: 3.32 + PCK@0.2: 0.988 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_rhd2d_256x256-37f1c4d3_20210330.pth diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py new file mode 100644 index 0000000..7591892 --- /dev/null +++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = 
dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) + +# codec settings +codec = dict(type='RegressionLabel', input_size=(256, 256)) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='RegressionHead', + in_channels=2048, + num_joints=21, + loss=dict(type='SmoothL1Loss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'Rhd2DDataset' +data_mode = 'topdown' +data_root = 'data/rhd/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict( + type='RandomBBoxTransform', rotate_factor=180, + scale_factor=(0.7, 1.3)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/rhd_test.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE'), +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_3d_keypoint/README.md b/modules/rtmpose/configs/hand_3d_keypoint/README.md new file mode 100644 index 0000000..752fd92 --- /dev/null +++ b/modules/rtmpose/configs/hand_3d_keypoint/README.md @@ -0,0 +1,7 @@ +# 3D Hand Pose Estimation + +3D hand pose estimation is defined as the task of detecting the poses (or keypoints) of the hand from an input image. + +## Data preparation + +Please follow [DATA Preparation](/docs/en/dataset_zoo/3d_hand_keypoint.md) to prepare data. 
diff --git a/modules/rtmpose/configs/hand_3d_keypoint/internet/README.md b/modules/rtmpose/configs/hand_3d_keypoint/internet/README.md new file mode 100644 index 0000000..d3d1260 --- /dev/null +++ b/modules/rtmpose/configs/hand_3d_keypoint/internet/README.md @@ -0,0 +1,10 @@ +# InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image + +## Results and Models + +### InterHand2.6m 3D Dataset + +| Arch | Set | MPJPE-single | MPJPE-interacting | MPJPE-all | MRRPE | APh | ckpt | log | Details and Download | +| :------------------------------- | :-------: | :----------: | :---------------: | :-------: | :---: | :--: | :------------------------------: | :-----------------------------: | :-----------------------------------------------: | +| [InterNet_resnet_50](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | test(H+M) | 9.47 | 13.40 | 11.59 | 29.28 | 0.99 | [ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256_20210702.log.json) | [internet_interhand3d.md](./interhand3d/internet_interhand3d.md) | +| [InterNet_resnet_50](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | val(M) | 11.22 | 15.23 | 13.16 | 31.73 | 0.98 | [ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256_20210702.log.json) | [internet_interhand3d.md](./interhand3d/internet_interhand3d.md) | diff --git a/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.md b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.md new file mode 100644 index 0000000..b706b75 --- /dev/null +++ 
b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.md @@ -0,0 +1,59 @@ + + +
+InterNet (ECCV'2020) + +```bibtex +@InProceedings{Moon_2020_ECCV_InterHand2.6M, +author = {Moon, Gyeongsik and Yu, Shoou-I and Wen, He and Shiratori, Takaaki and Lee, Kyoung Mu}, +title = {InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image}, +booktitle = {European Conference on Computer Vision (ECCV)}, +year = {2020} +} +``` + +
+ + + +
+ResNet (CVPR'2016) + +```bibtex +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +
+ + + +
+InterHand2.6M (ECCV'2020) + +```bibtex +@InProceedings{Moon_2020_ECCV_InterHand2.6M, +author = {Moon, Gyeongsik and Yu, Shoou-I and Wen, He and Shiratori, Takaaki and Lee, Kyoung Mu}, +title = {InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image}, +booktitle = {European Conference on Computer Vision (ECCV)}, +year = {2020} +} +``` + +
+ +Results on InterHand2.6M val & test set + +| Train Set | Set | Arch | Input Size | MPJPE-single | MPJPE-interacting | MPJPE-all | MRRPE | APh | ckpt | log | +| :-------- | :-------- | :----------------------------------------: | :--------: | :----------: | :---------------: | :-------: | :---: | :--: | :----------------------------------------: | :---------------------------------------: | +| All | test(H+M) | [InterNet_resnet_50](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 9.69 | 13.72 | 11.86 | 29.27 | 0.99 | [ckpt](https://download.openmmlab.com/mmpose/v1/hand_3d_keypoint/internet/interhand3d/internet_res50_interhand3d-d6ff20d6_20230913.pth) | [log](https://download.openmmlab.com/mmpose/v1/hand_3d_keypoint/internet/interhand3d/internet_res50_interhand3d-d6ff20d6_20230913.json) | +| All | val(M) | [InterNet_resnet_50](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 11.30 | 15.57 | 13.36 | 32.15 | 0.98 | [ckpt](https://download.openmmlab.com/mmpose/v1/hand_3d_keypoint/internet/interhand3d/internet_res50_interhand3d-d6ff20d6_20230913.pth) | [log](https://download.openmmlab.com/mmpose/v1/hand_3d_keypoint/internet/interhand3d/internet_res50_interhand3d-d6ff20d6_20230913.json) | +| All | test(H+M) | [InterNet_resnet_50\*](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 9.47 | 13.40 | 11.59 | 29.28 | 0.99 | [ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256_20210702.log.json) | +| All | val(M) | [InterNet_resnet_50\*](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 11.22 | 15.23 | 13.16 | 31.73 | 0.98 | 
[ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256_20210702.log.json) | + +*Models with * are trained in [MMPose 0.x](https://github.com/open-mmlab/mmpose/tree/0.x). The checkpoints and logs are only for validation.* diff --git a/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.yml b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.yml new file mode 100644 index 0000000..9446755 --- /dev/null +++ b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.yml @@ -0,0 +1,35 @@ +Collections: +- Name: InterNet + Paper: + Title: 'InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation + from a Single RGB Image' + URL: https://link.springer.com/content/pdf/10.1007/978-3-030-58565-5_33.pdf + README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/algorithms/internet.md +Models: +- Config: configs/hand_3d_keypoint/internet/interhand3d/internet_res50_4xb16-20e_interhand3d-256x256.py + In Collection: InterNet + Alias: hand3d + Metadata: + Architecture: &id001 + - InterNet + - ResNet + Training Data: InterHand2.6M + Name: internet_res50_4xb16-20e_interhand3d-256x256 + Results: + - Dataset: InterHand2.6M (H+M) + Metrics: + APh: 0.99 + MPJPE-all: 11.86 + MPJPE-interacting: 13.72 + MPJPE-single: 9.69 + MRRPE: 29.27 + Task: Hand 3D Keypoint + - Dataset: InterHand2.6M (M) + Metrics: + APh: 0.98 + MPJPE-all: 13.36 + MPJPE-interacting: 15.57 + MPJPE-single: 11.30 + MRRPE: 32.15 + Task: Hand 3D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth diff --git a/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_res50_4xb16-20e_interhand3d-256x256.py 
b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_res50_4xb16-20e_interhand3d-256x256.py new file mode 100644 index 0000000..49951e7 --- /dev/null +++ b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_res50_4xb16-20e_interhand3d-256x256.py @@ -0,0 +1,182 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# visualization +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +train_cfg = dict(max_epochs=20, val_interval=1) + +# optimizer +optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.0002)) + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=20, + milestones=[15, 17], + gamma=0.1, + by_epoch=True) +] + +auto_scale_lr = dict(base_batch_size=128) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=1, + save_best='MPJPE_all', + rule='less', + max_keep_ckpts=1), + logger=dict(type='LoggerHook', interval=20), +) + +# codec settings +codec = dict( + type='Hand3DHeatmap', + image_size=[256, 256], + root_heatmap_size=64, + heatmap_size=[64, 64, 64], + sigma=2.5, + max_bound=255, + depth_size=64) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + head=dict( + type='InternetHead', + keypoint_head_cfg=dict( + in_channels=2048, + out_channels=21 * 64, + depth_size=codec['depth_size'], + deconv_out_channels=(256, 256, 256), + deconv_kernel_sizes=(4, 4, 4), + ), + root_head_cfg=dict( + in_channels=2048, + heatmap_size=codec['root_heatmap_size'], + hidden_dims=(512, ), + ), + hand_type_head_cfg=dict( + in_channels=2048, + num_labels=2, + hidden_dims=(512, ), + ), + 
decoder=codec), + test_cfg=dict(flip_test=False)) + +# base dataset settings +dataset_type = 'InterHand3DDataset' +data_mode = 'topdown' +data_root = 'data/interhand2.6m/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='HandRandomFlip', prob=0.5), + dict(type='RandomBBoxTransform', rotate_factor=90.0), + dict(type='TopdownAffine', input_size=codec['image_size']), + dict(type='GenerateTarget', encoder=codec), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'rotation', 'img_shape', + 'focal', 'principal_pt', 'input_size', 'input_center', + 'input_scale', 'hand_type', 'hand_type_valid', 'flip', + 'flip_indices', 'abs_depth')) +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['image_size']), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'rotation', 'img_shape', + 'focal', 'principal_pt', 'input_size', 'input_center', + 'input_scale', 'hand_type', 'hand_type_valid', 'flip', + 'flip_indices', 'abs_depth')) +] + +# data loaders +train_dataloader = dict( + batch_size=16, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file='annotations/all/InterHand2.6M_train_data.json', + camera_param_file='annotations/all/InterHand2.6M_train_camera.json', + joint_file='annotations/all/InterHand2.6M_train_joint_3d.json', + use_gt_root_depth=True, + rootnet_result_file=None, + data_mode=data_mode, + data_root=data_root, + data_prefix=dict(img='images/train/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=16, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotations/machine_annot/InterHand2.6M_val_data.json', + 
camera_param_file='annotations/machine_annot/' + 'InterHand2.6M_val_camera.json', + joint_file='annotations/machine_annot/InterHand2.6M_val_joint_3d.json', + use_gt_root_depth=True, + rootnet_result_file=None, + data_mode=data_mode, + data_root=data_root, + data_prefix=dict(img='images/val/'), + pipeline=val_pipeline, + test_mode=True, + )) +test_dataloader = dict( + batch_size=16, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + ann_file='annotations/all/' + 'InterHand2.6M_test_data.json', + camera_param_file='annotations/all/' + 'InterHand2.6M_test_camera.json', + joint_file='annotations/all/' + 'InterHand2.6M_test_joint_3d.json', + use_gt_root_depth=True, + rootnet_result_file=None, + data_mode=data_mode, + data_root=data_root, + data_prefix=dict(img='images/test/'), + pipeline=val_pipeline, + test_mode=True, + )) + +# evaluators +val_evaluator = [ + dict(type='InterHandMetric', modes=['MPJPE', 'MRRPE', 'HandednessAcc']) +] +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/hand_gesture/README.md b/modules/rtmpose/configs/hand_gesture/README.md new file mode 100644 index 0000000..aaf4235 --- /dev/null +++ b/modules/rtmpose/configs/hand_gesture/README.md @@ -0,0 +1,13 @@ +# Gesture Recognition + +Gesture recognition aims to recognize the hand gestures in the video, such as thumbs up. + +## Data preparation + +Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_hand_gesture.md) to prepare data. + +## Demo + +Please follow [Demo](/demo/docs/en/gesture_recognition_demo.md) to run the demo. 
+ + diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/README.md b/modules/rtmpose/configs/wholebody_2d_keypoint/README.md new file mode 100644 index 0000000..305dd96 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/README.md @@ -0,0 +1,19 @@ +# 2D Human Whole-Body Pose Estimation + +2D human whole-body pose estimation aims to localize dense landmarks on the entire human body including face, hands, body, and feet. + +Existing approaches can be categorized into top-down and bottom-up approaches. + +Top-down methods divide the task into two stages: human detection and whole-body pose estimation. They perform human detection first, followed by single-person whole-body pose estimation given human bounding boxes. + +Bottom-up approaches (e.g. AE) first detect all the whole-body keypoints and then group/associate them into person instances. + +## Data preparation + +Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_wholebody_keypoint.md) to prepare data. + +## Demo + +Please follow [Demo](/demo/docs/en/2d_wholebody_pose_demo.md) to run demos. + +
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/README.md b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/README.md new file mode 100644 index 0000000..7bc9529 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/README.md @@ -0,0 +1,63 @@ +# DWPose + +Whole-body pose estimation localizes the human body, hand, face, and foot keypoints in an image. This task is challenging due to multi-scale body parts, fine-grained localization for low-resolution regions, and data scarcity. Meanwhile, applying a highly efficient and accurate pose estimator to widely human-centric understanding and generation tasks is urgent. In this work, we present a two-stage pose **D**istillation for **W**hole-body **P**ose estimators, named **DWPose**, to improve their effectiveness and efficiency. The first-stage distillation designs a weight-decay strategy while utilizing a teacher's intermediate feature and final logits with both visible and invisible keypoints to supervise the student from scratch. The second stage distills the student model itself to further improve performance. Different from the previous self-knowledge distillation, this stage finetunes the student's head with only 20% training time as a plug-and-play training strategy. For data limitations, we explore the UBody dataset that contains diverse facial expressions and hand gestures for real-life applications. Comprehensive experiments show the superiority of our proposed simple yet effective methods. We achieve new state-of-the-art performance on COCO-WholeBody, significantly boosting the whole-body AP of RTMPose-l from 64.8% to 66.5%, even surpassing RTMPose-x teacher with 65.3% AP. We release a series of models with different sizes, from tiny to large, for satisfying various downstream tasks. 
+ +## Results and Models + +### COCO-WholeBody Dataset + +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +- DWPose Models are supported by [DWPose](https://github.com/IDEA-Research/DWPose) +- Models are trained and distilled on the following datasets: + - [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/) + - [UBody](https://github.com/IDEA-Research/OSX) + +| Config | S1 Dis_config | S2 Dis_config | Input Size | Whole AP | Whole AR | FLOPS
(G) | ORT-Latency
(ms)
(i7-11700) | TRT-FP16-Latency
(ms)
(GTX 1660Ti) | Download | +| :----------- | :-----------------: | :-----------------: | :--------: | :------: | :------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :------------: | +| [DWPose-t](../rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW l-t](../dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py) | [DW t-t](../dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py) | 256x192 | 48.5 | 58.4 | 0.5 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-ucoco_dw-ucoco_270e-256x192-dcf277bf_20230728.pth) | +| [DWPose-s](../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW l-s](../dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py) | [DW s-s](../dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py) | 256x192 | 53.8 | 63.2 | 0.9 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-ucoco_dw-ucoco_270e-256x192-3fd922c8_20230728.pth) | +| [DWPose-m](../rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW l-m](../dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py) | [DW m-m](../dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py) | 256x192 | 60.6 | 69.5 | 2.22 | 13.50 | 4.00 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ucoco_dw-ucoco_270e-256x192-c8b76419_20230728.pth) | +| [DWPose-l](../rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW x-l](../dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py) | [DW l-l](../dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py) | 256x192 | 63.1 | 71.7 | 4.52 | 23.41 | 5.67 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth) | +| [DWPose-l](../rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py) | [DW 
x-l](../dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-384x288.py) | [DW l-l](../dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py) | 384x288 | 66.5 | 74.3 | 10.07 | 44.58 | 7.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-384x288-2438fd99_20230728.pth) | + +## Train a model + +### Train DWPose with the first stage distillation + +``` +bash tools/dist_train.sh configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py 8 +``` + +### Transfer the S1 distillation models into regular models + +``` +# first stage distillation +python pth_transfer.py $dis_ckpt $new_pose_ckpt +``` + +⭐Before S2 distillation, you should add your model path into 'teacher_pretrained' of your S2 dis_config. + +### Train DWPose with the second stage distillation + +``` +bash tools/dist_train.sh configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py 8 +``` + +### Transfer the S2 distillation models into regular models + +``` +# second stage distillation +python pth_transfer.py $dis_ckpt $new_pose_ckpt --two_dis +``` + +## Citation + +``` +@article{yang2023effective, + title={Effective Whole-body Pose Estimation with Two-stages Distillation}, + author={Yang, Zhendong and Zeng, Ailing and Yuan, Chun and Li, Yu}, + journal={arXiv preprint arXiv:2307.15880}, + year={2023} +} +``` diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py new file mode 100644 index 0000000..999ebfc --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True
+logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-l_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=768, + teacher_channels=1024, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py new file mode 100644 index 0000000..0ee0144 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-x_simcc-coco-wholebody_pt-body7_270e-384x288-401dfc90_20230629.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 
'rtmpose-x_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-l_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=1024, + teacher_channels=1280, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py new file mode 100644 index 0000000..dd0cf5f --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_x_dis_l_coco-384x288/dw-x-l_coco_384.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-l_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-l_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + 
data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py new file mode 100644 index 0000000..df3626f --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_l_dis_m_coco-256x192/dw-l-m_coco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/' + 'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py new file mode 100644 index 
0000000..f7c387d --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_ucoco_256x192-95bb32f5_20230822.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=768, + teacher_channels=1024, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py new file mode 100644 index 0000000..3d6c4aa --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + 
_delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_ucoco_256x192-95bb32f5_20230822.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=512, + teacher_channels=1024, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py new file mode 100644 index 0000000..5325db9 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_ucoco_256x192-95bb32f5_20230822.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 
'rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=384, + teacher_channels=1024, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py new file mode 100644 index 0000000..b1389d3 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-x_ucoco_256x192-05f5bcb7_20230822.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=1024, + teacher_channels=1280, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + 
type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py new file mode 100644 index 0000000..948116d --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py' # noqa: E501 +] + +# model settings +find_unused_parameters = False + +# config settings +fea = True +logit = True + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-x_ucoco_384x288-f5b50679_20230822.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='FeaLoss', + name='loss_fea', + use_this=fea, + student_channels=1024, + teacher_channels=1280, + alpha_fea=0.00007, + ) + ]), + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=0.1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +) +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py new 
file mode 100644 index 0000000..ce28745 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_x_dis_l_coco-ubody-256x192/dw-x-l_ucoco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py new file mode 100644 index 0000000..b049b50 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = 
dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_x_dis_l_coco-ubody-384x288/dw-x-l_ucoco_384.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py new file mode 100644 index 0000000..3050f08 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_l_dis_m_coco-ubody-256x192/dw-l-m_ucoco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + 
type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py new file mode 100644 index 0000000..83423d5 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_l_dis_s_coco-ubody-256x192/dw-l-s_ucoco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py 
b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py new file mode 100644 index 0000000..be772a7 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py @@ -0,0 +1,45 @@ +_base_ = [ + '../../../rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501 +] + +# model settings +find_unused_parameters = True + +# dis settings +second_dis = True + +# config settings +logit = True + +train_cfg = dict(max_epochs=60, val_interval=10) + +# method details +model = dict( + _delete_=True, + type='DWPoseDistiller', + two_dis=second_dis, + teacher_pretrained='work_dirs/' + 'dwpose_l_dis_t_coco-ubody-256x192/dw-l-t_ucoco_256.pth', # noqa: E501 + teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/' + 'rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501 + distill_cfg=[ + dict(methods=[ + dict( + type='KDLoss', + name='loss_logit', + use_this=logit, + weight=1, + ) + ]), + ], + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + train_cfg=train_cfg, +) + +optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2)) diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000..bddca2c --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/README.md @@ -0,0 +1,18 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. 
+In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. + +## Results and Models + +### COCO-WholeBody Dataset + +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Model | Input Size | Whole AP | Whole AR | Details and Download | +| :-------: | :--------: | :------: | :------: | :---------------------------------------------------------------------: | +| RTMPose-m | 256x192 | 0.582 | 0.674 | [rtmpose_coco-wholebody.md](./coco-wholebody/rtmpose_coco-wholebody.md) | +| RTMPose-l | 256x192 | 0.611 | 0.700 | [rtmpose_coco-wholebody.md](./coco-wholebody/rtmpose_coco-wholebody.md) | +| RTMPose-l | 384x288 | 0.648 | 0.730 | [rtmpose_coco-wholebody.md](./coco-wholebody/rtmpose_coco-wholebody.md) | diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py new file mode 100644 index 0000000..a037573 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py @@ -0,0 +1,615 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (192, 256) + +# runtime +max_epochs = 
270 +stage2_num_epochs = 10 +base_lr = 5e-4 +train_batch_size = 1024 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.1), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=8192) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa + )), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[256, 512, 1024], + out_channels=None, + out_indices=( + 1, + 2, + ), + num_csp_blocks=2, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + head=dict( + type='RTMWHead', + in_channels=1024, + out_channels=num_keypoints, + input_size=input_size, + 
in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=1., + label_softmax=True, + label_beta=10., + mask=list(range(23, 91)), + mask_weight=0.5, + ), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + 
type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping + +aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), + (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)] + +crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)] + +mpii_coco133 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco133 = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco133 = [(i, i) + for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21), + (24, 19), + (25, 22)] + [(i, i - 3) + for i in range(26, 136)] + +posetrack_coco133 = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120), + (19, 17), (20, 20)] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_coco133) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_coco133) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_coco133) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_coco133) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_coco133) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_coco133) + ], +) + +dataset_humanart = dict( + type='HumanArt21Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart.json', + filter_cfg=dict(scenes=['real_human']), + data_prefix=dict(img='pose/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=humanart_coco133) + ]) + +ubody_scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', 
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +ubody_datasets = [] +for scene in ubody_scenes: + each = dict( + type='UBody2dDataset', + data_root=data_root, + data_mode=data_mode, + ann_file=f'Ubody/annotations/{scene}/train_annotations.json', + data_prefix=dict(img='pose/UBody/images/'), + pipeline=[], + sample_interval=10) + ubody_datasets.append(each) + +dataset_ubody = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'), + datasets=ubody_datasets, + pipeline=[], + test_mode=False, +) + +face_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale', padding=1.25), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +wflw_coco133 = [(i * 2, 23 + i) + for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [ + (42 + i, 45 + i) for i in range(5) + ] + [(51 + i, 50 + i) + for i in range(9)] + [(60, 59), (61, 60), (63, 61), + (64, 62), (65, 63), (67, 64), + (68, 65), (69, 66), (71, 67), + (72, 68), (73, 69), + (75, 70)] + [(76 + i, 71 + i) + for i in range(20)] +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=wflw_coco133), *face_pipeline + ], +) + +mapping_300w_coco133 = [(i, 23 + i) for i in range(68)] +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mapping_300w_coco133), *face_pipeline + ], +) + +cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59), + (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 
53), + (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89), + (27, 80), (28, 31)] +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=cofw_coco133), *face_pipeline + ], +) + +lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [ + (33 + i, 40 + i) for i in range(5) +] + [(42 + i, 45 + i) for i in range(5)] + [ + (51 + i, 50 + i) for i in range(4) +] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61), + (70, 62), (71, 63), (73, 64), + (75, 65), (76, 66), (78, 67), + (79, 68), (80, 69), + (82, 70)] + [(84 + i, 71 + i) + for i in range(20)] +dataset_lapa = dict( + type='LapaDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=lapa_coco133), *face_pipeline + ], +) + +dataset_wb = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_coco, dataset_halpe, dataset_ubody], + pipeline=[], + test_mode=False, +) + +dataset_body = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_posetrack, + dataset_humanart, + ], + pipeline=[], + test_mode=False, +) + +dataset_face = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_wflw, + dataset_300w, + dataset_cofw, + dataset_lapa, + ], + pipeline=[], + test_mode=False, +) + +hand_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + 
shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98), + (27, 97), (28, 96), (29, 103), (30, 102), (31, 101), + (32, 100), (33, 107), (34, 106), (35, 105), (36, 104), + (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)] +interhand_right = [(i - 21, j + 21) for i, j in interhand_left] +interhand_coco133 = interhand_right + interhand_left + +dataset_interhand2d = dict( + type='InterHand2DDoubleDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json', + camera_param_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_camera.json', + joint_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_joint_3d.json', + data_prefix=dict(img='interhand2.6m/images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_hand = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_interhand2d], + pipeline=[], + test_mode=False, +) + +train_datasets = [dataset_wb, dataset_body, dataset_face, dataset_hand] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=4, + pin_memory=False, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + 
data_prefix=dict(img='data/detection/coco/val2017/'), + pipeline=val_pipeline, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + test_mode=True)) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py new file mode 100644 index 0000000..e095f3b --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py @@ -0,0 +1,617 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (288, 384) + +# runtime +max_epochs = 270 +stage2_num_epochs = 10 +base_lr = 5e-4 +train_batch_size = 320 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.1), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr 
* 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=2560) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False, + decode_visibility=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa + )), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[256, 512, 1024], + out_channels=None, + out_indices=( + 1, + 2, + ), + num_csp_blocks=2, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + head=dict( + type='RTMWHead', + in_channels=1024, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=1., + label_softmax=True, + label_beta=10., + mask=list(range(23, 91)), + mask_weight=0.5, + ), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' 
+data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping + +aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), + (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)] + +crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)] + +mpii_coco133 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + 
(11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco133 = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco133 = [(i, i) + for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21), + (24, 19), + (25, 22)] + [(i, i - 3) + for i in range(26, 136)] + +posetrack_coco133 = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120), + (19, 17), (20, 20)] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_coco133) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_coco133) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_coco133) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_coco133) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_coco133) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_coco133) + ], +) + +dataset_humanart = dict( + type='HumanArt21Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart.json', + filter_cfg=dict(scenes=['real_human']), + data_prefix=dict(img='pose/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=humanart_coco133) + ]) + +ubody_scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +ubody_datasets = [] +for scene in ubody_scenes: + each = dict( + type='UBody2dDataset', + data_root=data_root, + data_mode=data_mode, + ann_file=f'Ubody/annotations/{scene}/train_annotations.json', + data_prefix=dict(img='pose/UBody/images/'), + pipeline=[], + sample_interval=10) + ubody_datasets.append(each) + +dataset_ubody = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'), + datasets=ubody_datasets, + pipeline=[], + test_mode=False, +) + +face_pipeline = [ + dict(type='LoadImage', 
backend_args=backend_args), + dict(type='GetBBoxCenterScale', padding=1.25), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +wflw_coco133 = [(i * 2, 23 + i) + for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [ + (42 + i, 45 + i) for i in range(5) + ] + [(51 + i, 50 + i) + for i in range(9)] + [(60, 59), (61, 60), (63, 61), + (64, 62), (65, 63), (67, 64), + (68, 65), (69, 66), (71, 67), + (72, 68), (73, 69), + (75, 70)] + [(76 + i, 71 + i) + for i in range(20)] +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=wflw_coco133), *face_pipeline + ], +) + +mapping_300w_coco133 = [(i, 23 + i) for i in range(68)] +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mapping_300w_coco133), *face_pipeline + ], +) + +cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59), + (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53), + (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89), + (27, 80), (28, 31)] +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=cofw_coco133), *face_pipeline + ], +) + +lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [ + (33 + i, 40 + i) for i in range(5) +] + [(42 + i, 45 + i) for i in range(5)] + [ + (51 + i, 50 + i) for i in range(4) +] + [(58 + i, 54 + i) 
for i in range(5)] + [(66, 59), (67, 60), (69, 61), + (70, 62), (71, 63), (73, 64), + (75, 65), (76, 66), (78, 67), + (79, 68), (80, 69), + (82, 70)] + [(84 + i, 71 + i) + for i in range(20)] +dataset_lapa = dict( + type='LapaDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=lapa_coco133), *face_pipeline + ], +) + +dataset_wb = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_coco, dataset_halpe, dataset_ubody], + pipeline=[], + test_mode=False, +) + +dataset_body = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_posetrack, + dataset_humanart, + ], + pipeline=[], + test_mode=False, +) + +dataset_face = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_wflw, + dataset_300w, + dataset_cofw, + dataset_lapa, + ], + pipeline=[], + test_mode=False, +) + +hand_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98), + (27, 97), (28, 96), (29, 103), (30, 102), (31, 101), + (32, 100), (33, 107), (34, 106), (35, 105), (36, 104), + (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)] +interhand_right = [(i - 21, j + 21) for i, j in interhand_left] +interhand_coco133 = interhand_right + interhand_left + +dataset_interhand2d = dict( + type='InterHand2DDoubleDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json', + camera_param_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_camera.json', + joint_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_joint_3d.json', + data_prefix=dict(img='interhand2.6m/images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_hand = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_interhand2d], + pipeline=[], + test_mode=False, +) + +train_datasets = [dataset_wb, dataset_body, dataset_face, dataset_hand] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=4, + pin_memory=False, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='data/detection/coco/val2017/'), + pipeline=val_pipeline, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + test_mode=True)) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + 
switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py new file mode 100644 index 0000000..cec4872 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py @@ -0,0 +1,615 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (192, 256) + +# runtime +max_epochs = 270 +stage2_num_epochs = 10 +base_lr = 5e-4 +train_batch_size = 1024 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=8192) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), 
+ backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/rtmpose-m_simcc-ucoco_dw-ucoco_270e-256x192-c8b76419_20230728.pth' # noqa + )), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[192, 384, 768], + out_channels=None, + out_indices=( + 1, + 2, + ), + num_csp_blocks=2, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + head=dict( + type='RTMWHead', + in_channels=768, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=1., + label_softmax=True, + label_beta=10., + mask=list(range(23, 91)), + mask_weight=0.5, + ), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + 
min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping + +aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), + (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)] + +crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)] + +mpii_coco133 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco133 = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco133 = [(i, i) + for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21), + (24, 19), + (25, 22)] + [(i, i - 3) + for i in range(26, 136)] + +posetrack_coco133 = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120), + (19, 
17), (20, 20)] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_coco133) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_coco133) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_coco133) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_coco133) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_coco133) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_coco133) + ], +) + +dataset_humanart = dict( + type='HumanArt21Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart.json', + filter_cfg=dict(scenes=['real_human']), + data_prefix=dict(img='pose/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=humanart_coco133) + ]) + +ubody_scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +ubody_datasets = [] +for scene in ubody_scenes: + each = dict( + type='UBody2dDataset', + data_root=data_root, + data_mode=data_mode, + ann_file=f'Ubody/annotations/{scene}/train_annotations.json', + data_prefix=dict(img='pose/UBody/images/'), + pipeline=[], + sample_interval=10) + ubody_datasets.append(each) + +dataset_ubody = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'), + datasets=ubody_datasets, + pipeline=[], + test_mode=False, +) + +face_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale', padding=1.25), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +wflw_coco133 = [(i * 2, 23 + i) + for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [ + (42 + i, 45 + i) for i in range(5) + ] + [(51 + i, 50 + i) + for i in range(9)] + [(60, 59), (61, 60), (63, 61), + (64, 62), (65, 63), (67, 64), + (68, 65), (69, 66), (71, 67), + (72, 68), (73, 69), + (75, 70)] + [(76 + i, 71 + i) + for i in range(20)] +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=wflw_coco133), *face_pipeline + ], +) + +mapping_300w_coco133 = [(i, 23 + i) for i in range(68)] +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mapping_300w_coco133), *face_pipeline + ], +) + +cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59), + (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53), + (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89), + (27, 80), (28, 31)] +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=cofw_coco133), *face_pipeline + ], +) + +lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [ + (33 + i, 40 + i) for i in range(5) +] + [(42 + i, 45 + i) for i in range(5)] + [ + (51 + i, 50 + i) for i in range(4) +] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61), + (70, 62), (71, 63), (73, 64), + (75, 65), (76, 66), (78, 67), + (79, 68), (80, 69), + (82, 70)] + [(84 + i, 71 + i) + for i in range(20)] +dataset_lapa = dict( + type='LapaDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=lapa_coco133), *face_pipeline + ], +) + +dataset_wb = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + 
datasets=[dataset_coco, dataset_halpe, dataset_ubody], + pipeline=[], + test_mode=False, +) + +dataset_body = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_posetrack, + dataset_humanart, + ], + pipeline=[], + test_mode=False, +) + +dataset_face = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_wflw, + dataset_300w, + dataset_cofw, + dataset_lapa, + ], + pipeline=[], + test_mode=False, +) + +hand_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98), + (27, 97), (28, 96), (29, 103), (30, 102), (31, 101), + (32, 100), (33, 107), (34, 106), (35, 105), (36, 104), + (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)] +interhand_right = [(i - 21, j + 21) for i, j in interhand_left] +interhand_coco133 = interhand_right + interhand_left + +dataset_interhand2d = dict( + type='InterHand2DDoubleDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json', + camera_param_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_camera.json', + joint_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_joint_3d.json', + data_prefix=dict(img='interhand2.6m/images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_hand = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_interhand2d], + pipeline=[], + test_mode=False, +) + +train_datasets = 
[dataset_wb, dataset_body, dataset_face, dataset_hand] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=4, + pin_memory=False, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='data/detection/coco/val2017/'), + pipeline=val_pipeline, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + test_mode=True)) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py new file mode 100644 index 0000000..e5f1754 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py @@ -0,0 +1,617 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# 
common setting +num_keypoints = 133 +input_size = (288, 384) + +# runtime +max_epochs = 270 +stage2_num_epochs = 10 +base_lr = 5e-4 +train_batch_size = 320 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.1), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=2560) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False, + decode_visibility=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/' + 'wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa + )), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[320, 640, 1280], + out_channels=None, + out_indices=( + 1, + 2, + ), + num_csp_blocks=2, + expand_ratio=0.5, + 
norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + head=dict( + type='RTMWHead', + in_channels=1280, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=1., + label_softmax=True, + label_beta=10., + mask=list(range(23, 91)), + mask_weight=0.5, + ), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping + +aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), + (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)] + +crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)] + +mpii_coco133 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco133 = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco133 = [(i, i) + for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21), + (24, 19), + (25, 22)] + [(i, i - 3) + for i in range(26, 136)] + +posetrack_coco133 = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120), + (19, 17), (20, 20)] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + 
type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_coco133) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_coco133) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_coco133) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_coco133) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_coco133) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_coco133) + ], +) + +dataset_humanart = dict( + type='HumanArt21Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart.json', + filter_cfg=dict(scenes=['real_human']), + data_prefix=dict(img='pose/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + 
mapping=humanart_coco133) + ]) + +ubody_scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +ubody_datasets = [] +for scene in ubody_scenes: + each = dict( + type='UBody2dDataset', + data_root=data_root, + data_mode=data_mode, + ann_file=f'Ubody/annotations/{scene}/train_annotations.json', + data_prefix=dict(img='pose/UBody/images/'), + pipeline=[], + sample_interval=10) + ubody_datasets.append(each) + +dataset_ubody = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'), + datasets=ubody_datasets, + pipeline=[], + test_mode=False, +) + +face_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale', padding=1.25), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +wflw_coco133 = [(i * 2, 23 + i) + for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [ + (42 + i, 45 + i) for i in range(5) + ] + [(51 + i, 50 + i) + for i in range(9)] + [(60, 59), (61, 60), (63, 61), + (64, 62), (65, 63), (67, 64), + (68, 65), (69, 66), (71, 67), + (72, 68), (73, 69), + (75, 70)] + [(76 + i, 71 + i) + for i in range(20)] +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=wflw_coco133), *face_pipeline + ], +) + +mapping_300w_coco133 = [(i, 23 + i) for i in range(68)] +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, 
+ mapping=mapping_300w_coco133), *face_pipeline + ], +) + +cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59), + (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53), + (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89), + (27, 80), (28, 31)] +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=cofw_coco133), *face_pipeline + ], +) + +lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [ + (33 + i, 40 + i) for i in range(5) +] + [(42 + i, 45 + i) for i in range(5)] + [ + (51 + i, 50 + i) for i in range(4) +] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61), + (70, 62), (71, 63), (73, 64), + (75, 65), (76, 66), (78, 67), + (79, 68), (80, 69), + (82, 70)] + [(84 + i, 71 + i) + for i in range(20)] +dataset_lapa = dict( + type='LapaDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=lapa_coco133), *face_pipeline + ], +) + +dataset_wb = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_coco, dataset_halpe, dataset_ubody], + pipeline=[], + test_mode=False, +) + +dataset_body = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_posetrack, + dataset_humanart, + ], + pipeline=[], + test_mode=False, +) + +dataset_face = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_wflw, + dataset_300w, + dataset_cofw, + dataset_lapa, + ], + 
pipeline=[], + test_mode=False, +) + +hand_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98), + (27, 97), (28, 96), (29, 103), (30, 102), (31, 101), + (32, 100), (33, 107), (34, 106), (35, 105), (36, 104), + (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)] +interhand_right = [(i - 21, j + 21) for i, j in interhand_left] +interhand_coco133 = interhand_right + interhand_left + +dataset_interhand2d = dict( + type='InterHand2DDoubleDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json', + camera_param_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_camera.json', + joint_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_joint_3d.json', + data_prefix=dict(img='interhand2.6m/images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_hand = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_interhand2d], + pipeline=[], + test_mode=False, +) + +train_datasets = [dataset_wb, dataset_body, dataset_face, dataset_hand] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=4, + pin_memory=False, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', 
shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='data/detection/coco/val2017/'), + pipeline=val_pipeline, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + test_mode=True)) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py new file mode 100644 index 0000000..d9852cf --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py @@ -0,0 +1,615 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (192, 256) + +# runtime +max_epochs = 270 +stage2_num_epochs = 10 +base_lr = 5e-4 +train_batch_size = 704 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.1), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + 
start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=5632) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/' + 'wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-256x192-05f5bcb7_20230822.pth' # noqa + )), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[320, 640, 1280], + out_channels=None, + out_indices=( + 1, + 2, + ), + num_csp_blocks=2, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + head=dict( + type='RTMWHead', + in_channels=1280, + out_channels=num_keypoints, + input_size=input_size, + in_featuremap_size=tuple([s // 32 for s in input_size]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=1., + label_softmax=True, + label_beta=10., + mask=list(range(23, 91)), + mask_weight=0.5, + ), + decoder=codec), + 
test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PhotometricDistortion'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + ]), + dict( + type='GenerateTarget', + encoder=codec, + use_dataset_keypoint_weights=True), + dict(type='PackPoseInputs') +] + +# mapping + +aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), + (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)] + +crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), 
(10, 15), (11, 16)] + +mpii_coco133 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco133 = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco133 = [(i, i) + for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21), + (24, 19), + (25, 22)] + [(i, i - 3) + for i in range(26, 136)] + +posetrack_coco133 = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120), + (19, 17), (20, 20)] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_coco133) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_coco133) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + 
mapping=mpii_coco133) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_coco133) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_coco133) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_coco133) + ], +) + +dataset_humanart = dict( + type='HumanArt21Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart.json', + filter_cfg=dict(scenes=['real_human']), + data_prefix=dict(img='pose/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=humanart_coco133) + ]) + +ubody_scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +ubody_datasets = [] +for scene in ubody_scenes: + each = dict( + type='UBody2dDataset', + data_root=data_root, + data_mode=data_mode, + ann_file=f'Ubody/annotations/{scene}/train_annotations.json', + data_prefix=dict(img='pose/UBody/images/'), + pipeline=[], + sample_interval=10) + ubody_datasets.append(each) + +dataset_ubody = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'), + 
datasets=ubody_datasets, + pipeline=[], + test_mode=False, +) + +face_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale', padding=1.25), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +wflw_coco133 = [(i * 2, 23 + i) + for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [ + (42 + i, 45 + i) for i in range(5) + ] + [(51 + i, 50 + i) + for i in range(9)] + [(60, 59), (61, 60), (63, 61), + (64, 62), (65, 63), (67, 64), + (68, 65), (69, 66), (71, 67), + (72, 68), (73, 69), + (75, 70)] + [(76 + i, 71 + i) + for i in range(20)] +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=wflw_coco133), *face_pipeline + ], +) + +mapping_300w_coco133 = [(i, 23 + i) for i in range(68)] +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mapping_300w_coco133), *face_pipeline + ], +) + +cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59), + (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53), + (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89), + (27, 80), (28, 31)] +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=cofw_coco133), *face_pipeline + ], +) + +lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [ + (33 + i, 40 + i) for i in 
range(5) +] + [(42 + i, 45 + i) for i in range(5)] + [ + (51 + i, 50 + i) for i in range(4) +] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61), + (70, 62), (71, 63), (73, 64), + (75, 65), (76, 66), (78, 67), + (79, 68), (80, 69), + (82, 70)] + [(84 + i, 71 + i) + for i in range(20)] +dataset_lapa = dict( + type='LapaDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=lapa_coco133), *face_pipeline + ], +) + +dataset_wb = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_coco, dataset_halpe, dataset_ubody], + pipeline=[], + test_mode=False, +) + +dataset_body = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_posetrack, + dataset_humanart, + ], + pipeline=[], + test_mode=False, +) + +dataset_face = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_wflw, + dataset_300w, + dataset_cofw, + dataset_lapa, + ], + pipeline=[], + test_mode=False, +) + +hand_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98), + (27, 97), (28, 96), (29, 103), (30, 102), (31, 101), + (32, 100), (33, 107), (34, 106), (35, 105), (36, 104), + (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)] +interhand_right = [(i - 21, j + 21) for i, j in interhand_left] +interhand_coco133 = interhand_right + interhand_left + +dataset_interhand2d = dict( + 
type='InterHand2DDoubleDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json', + camera_param_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_camera.json', + joint_file='interhand26m/annotations/all/' + 'InterHand2.6M_train_joint_3d.json', + data_prefix=dict(img='interhand2.6m/images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_hand = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_interhand2d], + pipeline=[], + test_mode=False, +) + +train_datasets = [dataset_wb, dataset_body, dataset_face, dataset_hand] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=4, + pin_memory=False, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='data/detection/coco/val2017/'), + pipeline=val_pipeline, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + test_mode=True)) + +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + 
type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.md b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.md new file mode 100644 index 0000000..678bc64 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.md @@ -0,0 +1,80 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +- `Cocktail14` denotes model trained on 14 public datasets: + - [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic) + - [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose) + - [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii) + - [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset) + - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe) + - [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18) + - [COCO-Wholebody](https://github.com/jin-s13/COCO-WholeBody/) + - [UBody](https://github.com/IDEA-Research/OSX) + - [Human-Art](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#human-art-dataset) + - [WFLW](https://wywu.github.io/projects/LAB/WFLW.html) + - [300W](https://ibug.doc.ic.ac.uk/resources/300-W/) + - [COFW](http://www.vision.caltech.edu/xpburgos/ICCV13/) + - [LaPa](https://github.com/JDAI-CV/lapa-dataset) + - [InterHand](https://mks0601.github.io/InterHand2.6M/) + +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------------------------: | :-: | +| [rtmw-m](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py) | 256x192 | 0.676 | 0.747 | 0.671 | 0.794 | 0.783 | 0.854 | 0.491 | 0.604 | 0.582 | 0.673 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-l-m_simcc-cocktail14_270e-256x192-20231122.pth) | - | +| 
[rtmw-l](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py) | 256x192 | 0.743 | 0.807 | 0.763 | 0.868 | 0.834 | 0.889 | 0.598 | 0.701 | 0.660 | 0.746 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-x-l_simcc-cocktail14_270e-256x192-20231122.pth) | - | +| [rtmw-x](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py) | 256x192 | 0.746 | 0.808 | 0.770 | 0.869 | 0.844 | 0.896 | 0.610 | 0.710 | 0.672 | 0.752 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail14_pt-ucoco_270e-256x192-13a2546d_20231208.pth) | - | +| [rtmw-l](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py) | 384x288 | 0.761 | 0.824 | 0.793 | 0.885 | 0.884 | 0.921 | 0.663 | 0.752 | 0.701 | 0.780 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-x-l_simcc-cocktail14_270e-384x288-20231122.pth) | - | +| [rtmw-x](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py) | 384x288 | 0.763 | 0.826 | 0.796 | 0.888 | 0.884 | 0.923 | 0.664 | 0.755 | 0.702 | 0.781 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail14_pt-ucoco_270e-384x288-f840f204_20231122.pth) | - | diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.yml new file mode 100644 index 0000000..4d84c88 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.yml @@ -0,0 +1,108 @@ +Models: +- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py + In Collection: RTMPose + Alias: wholebody + Metadata: + Architecture: &id001 + - RTMPose + Training Data: COCO-WholeBody + Name: rtmw-m_8xb1024-270e_cocktail14-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 
0.676 + Body AR: 0.747 + Face AP: 0.783 + Face AR: 0.854 + Foot AP: 0.671 + Foot AR: 0.794 + Hand AP: 0.491 + Hand AR: 0.604 + Whole AP: 0.582 + Whole AR: 0.673 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-l-m_simcc-cocktail14_270e-256x192-20231122.pth +- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: rtmw-l_8xb1024-270e_cocktail14-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.743 + Body AR: 0.807 + Face AP: 0.834 + Face AR: 0.889 + Foot AP: 0.763 + Foot AR: 0.868 + Hand AP: 0.598 + Hand AR: 0.701 + Whole AP: 0.660 + Whole AR: 0.746 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-x-l_simcc-cocktail14_270e-256x192-20231122.pth +- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: rtmw-x_8xb704-270e_cocktail14-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.746 + Body AR: 0.808 + Face AP: 0.844 + Face AR: 0.896 + Foot AP: 0.770 + Foot AR: 0.869 + Hand AP: 0.610 + Hand AR: 0.710 + Whole AP: 0.672 + Whole AR: 0.752 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail14_pt-ucoco_270e-256x192-13a2546d_20231208.pth +- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: rtmw-l_8xb320-270e_cocktail14-384x288 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.761 + Body AR: 0.824 + Face AP: 0.884 + Face AR: 0.921 + Foot AP: 0.793 + Foot AR: 0.885 + Hand AP: 0.663 + Hand AR: 0.752 + Whole AP: 0.701 + 
Whole AR: 0.780 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-x-l_simcc-cocktail14_270e-384x288-20231122.pth +- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: rtmw-x_8xb320-270e_cocktail14-384x288 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.763 + Body AR: 0.826 + Face AP: 0.884 + Face AR: 0.923 + Foot AP: 0.796 + Foot AR: 0.888 + Hand AP: 0.664 + Hand AR: 0.755 + Whole AP: 0.702 + Whole AR: 0.781 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail14_pt-ucoco_270e-384x288-f840f204_20231122.pth diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py new file mode 100644 index 0000000..9c881a9 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR 
based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + 
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py new file mode 100644 index 0000000..5c57c56 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + 
type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 
's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py new file mode 100644 index 0000000..2abf14b --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + 
paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 
'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, 
+ persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py new file mode 100644 index 0000000..7831708 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py @@ -0,0 +1,233 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (288, 384) + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr 
= 4e-3 +train_batch_size = 32 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1280, + out_channels=num_keypoints, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + 
drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', 
encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.md new file mode 100644 index 0000000..93c8434 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.md @@ -0,0 +1,62 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [rtmpose-m](/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 0.673 | 0.750 | 0.615 | 0.752 | 0.813 | 0.871 | 0.475 | 0.589 | 0.582 | 0.674 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.json) | +| [rtmpose-l](/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 0.695 | 0.769 | 0.658 | 0.785 | 0.833 | 0.887 | 0.519 | 0.628 | 0.611 | 0.700 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.json) | +| [rtmpose-l](/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 0.712 | 0.781 | 0.693 | 0.811 | 0.882 | 0.919 | 0.579 | 0.677 | 0.648 | 0.730 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.json) | 
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.yml new file mode 100644 index 0000000..e2f6b51 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.yml @@ -0,0 +1,66 @@ +Models: +- Config: configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py + In Collection: RTMPose + Alias: wholebody + Metadata: + Architecture: &id001 + - RTMPose + Training Data: COCO-WholeBody + Name: rtmpose-m_8xb64-270e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.673 + Body AR: 0.750 + Face AP: 0.813 + Face AR: 0.871 + Foot AP: 0.615 + Foot AR: 0.752 + Hand AP: 0.475 + Hand AR: 0.589 + Whole AP: 0.582 + Whole AR: 0.674 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth +- Config: configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: rtmpose-l_8xb64-270e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.695 + Body AR: 0.769 + Face AP: 0.833 + Face AR: 0.887 + Foot AP: 0.658 + Foot AR: 0.785 + Hand AP: 0.519 + Hand AR: 0.628 + Whole AP: 0.611 + Whole AR: 0.700 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth +- Config: configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py + In Collection: RTMPose + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: rtmpose-l_8xb32-270e_coco-wholebody-384x288 + 
Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.712 + Body AR: 0.781 + Face AP: 0.882 + Face AR: 0.919 + Foot AP: 0.693 + Foot AR: 0.811 + Hand AP: 0.579 + Hand AR: 0.677 + Whole AP: 0.648 + Whole AR: 0.730 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.pth diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py new file mode 100644 index 0000000..e8b4bfb --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py @@ -0,0 +1,256 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 32 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + 
data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(9, 12), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# 
pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + 
pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000..fb7f783 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,256 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + 
paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 
'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + 
rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000..6fe5fc4 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,256 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + 
type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + 
type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + 
bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000..1620793 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,256 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + 
convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + 
ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + 
dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000..fb90798 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,256 
@@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + 
hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + 
update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py new file mode 100644 index 0000000..71330c1 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py @@ -0,0 +1,260 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (288, 384) + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 32 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=input_size, + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', 
+ data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1280, + out_channels=num_keypoints, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + 
sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + 
metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py new file mode 100644 index 0000000..44d13e2 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py @@ -0,0 +1,260 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# common setting +num_keypoints = 133 +input_size = (192, 256) + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 +train_batch_size = 64 +val_batch_size = 32 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) 
+randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + clip_grad=dict(max_norm=35, norm_type=2), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa: E501 + )), + head=dict( + type='RTMCCHead', + in_channels=1280, + out_channels=num_keypoints, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + 
use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +backend_args = dict(backend='local') + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=train_batch_size, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) + +val_dataloader = dict( + batch_size=val_batch_size, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators 
+val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/README.md new file mode 100644 index 0000000..c60db05 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/README.md @@ -0,0 +1,35 @@ +# Top-down heatmap-based pose estimation + +Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. Instead of estimating keypoint coordinates directly, the pose estimator will produce heatmaps which represent the likelihood of being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html). + +
+ +
+ +## Results and Models + +### COCO-WholeBody Dataset + +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Model | Input Size | Whole AP | Whole AR | Details and Download | +| :-----------------: | :--------: | :------: | :------: | :-----------------------------------------------------------------------------: | +| HRNet-w48+Dark+ | 384x288 | 0.661 | 0.743 | [hrnet_dark_coco-wholebody.md](./coco-wholebody/hrnet_dark_coco-wholebody.md) | +| HRNet-w32+Dark | 256x192 | 0.582 | 0.671 | [hrnet_dark_coco-wholebody.md](./coco-wholebody/hrnet_dark_coco-wholebody.md) | +| HRNet-w48 | 256x192 | 0.579 | 0.681 | [hrnet_coco-wholebody.md](./coco-wholebody/hrnet_coco-wholebody.md) | +| CSPNeXt-m | 256x192 | 0.567 | 0.641 | [cspnext_udp_coco-wholebody.md](./coco-wholebody/cspnext_udp_coco-wholebody.md) | +| HRNet-w32 | 256x192 | 0.549 | 0.646 | [hrnet_ubody-coco-wholebody.md](./ubody2d/hrnet_ubody-coco-wholebody.md) | +| ResNet-152 | 256x192 | 0.548 | 0.661 | [resnet_coco-wholebody.md](./coco-wholebody/resnet_coco-wholebody.md) | +| HRNet-w32 | 256x192 | 0.536 | 0.636 | [hrnet_coco-wholebody.md](./coco-wholebody/hrnet_coco-wholebody.md) | +| ResNet-101 | 256x192 | 0.531 | 0.645 | [resnet_coco-wholebody.md](./coco-wholebody/resnet_coco-wholebody.md) | +| S-ViPNAS-Res50+Dark | 256x192 | 0.528 | 0.632 | [vipnas_dark_coco-wholebody.md](./coco-wholebody/vipnas_dark_coco-wholebody.md) | +| ResNet-50 | 256x192 | 0.521 | 0.633 | [resnet_coco-wholebody.md](./coco-wholebody/resnet_coco-wholebody.md) | +| S-ViPNAS-Res50 | 256x192 | 0.495 | 0.607 | [vipnas_coco-wholebody.md](./coco-wholebody/vipnas_coco-wholebody.md) | + +### UBody2D Dataset + +Result on UBody val set, computed with gt keypoints. 
+ +| Model | Input Size | Whole AP | Whole AR | Details and Download | +| :-------: | :--------: | :------: | :------: | :----------------------------------------------------------------------: | +| HRNet-w32 | 256x192 | 0.690 | 0.729 | [hrnet_ubody-coco-wholebody.md](./ubody2d/hrnet_ubody-coco-wholebody.md) | diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-l_udp_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-l_udp_8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..aa98e5a --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-l_udp_8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,212 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + 
type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', 
backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 
'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..d6d1c2f --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,212 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + 
prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +backend_args = dict(backend='local') +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.md 
b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.md new file mode 100644 index 0000000..7f8e000 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.md @@ -0,0 +1,56 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
 + +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [pose_cspnext_m_udp](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.687 | 0.735 | 0.680 | 0.763 | 0.697 | 0.755 | 0.460 | 0.543 | 0.567 | 0.641 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco-wholebody_pt-in1k_210e-256x192-320fa258_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco-wholebody_pt-in1k_210e-256x192-320fa258_20230123.json) | diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.yml new file mode 100644 index 0000000..bdcb4c5 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.yml @@ -0,0 +1,24 @@ +Models: +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py + In Collection: UDP + Metadata: + Architecture: &id001 + - UDP + - CSPNeXt + Training Data: COCO-WholeBody + Name: cspnext-m_udp_8xb64-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.687 + Body AR: 0.735 + Face AP: 0.697 + Face AR: 0.755 + Foot AP: 0.680 + Foot AR: 0.763 + Hand AP: 0.46 + Hand AR: 0.543 + Whole AP: 0.567 + Whole AR: 0.641 + Task: Wholebody 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco-wholebody_pt-in1k_210e-256x192-320fa258_20230123.pth diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.md new file mode 100644 index 0000000..8dd01d5 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.md @@ -0,0 +1,41 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [pose_hrnet_w32](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.678 | 0.755 | 0.543 | 0.661 | 0.630 | 0.708 | 0.467 | 0.566 | 0.536 | 0.636 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192-853765cd_20200918.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192_20200918.log.json) | +| [pose_hrnet_w32](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py) | 384x288 | 0.700 | 0.772 | 0.585 | 0.691 | 0.726 | 0.783 | 0.515 | 0.603 | 0.586 | 0.673 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_384x288-78cacac3_20200922.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_384x288_20200922.log.json) | +| [pose_hrnet_w48](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py) | 256x192 | 0.701 | 0.776 | 0.675 | 0.787 | 0.656 | 0.743 | 0.535 | 0.639 | 0.579 | 0.681 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_256x192-643e18cb_20200922.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_256x192_20200922.log.json) | +| [pose_hrnet_w48](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py) | 
384x288 | 0.722 | 0.791 | 0.696 | 0.801 | 0.776 | 0.834 | 0.587 | 0.678 | 0.632 | 0.717 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288-6e061c6a_20200922.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288_20200922.log.json) | diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.yml new file mode 100644 index 0000000..2cee2ac --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.yml @@ -0,0 +1,86 @@ +Models: +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py + In Collection: HRNet + Metadata: + Architecture: &id001 + - HRNet + Training Data: COCO-WholeBody + Name: td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.678 + Body AR: 0.755 + Face AP: 0.630 + Face AR: 0.708 + Foot AP: 0.543 + Foot AR: 0.661 + Hand AP: 0.467 + Hand AR: 0.566 + Whole AP: 0.536 + Whole AR: 0.636 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192-853765cd_20200918.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.700 + Body AR: 0.772 + Face AP: 0.726 + Face AR: 0.783 + Foot AP: 0.585 + Foot AR: 0.691 + Hand AP: 0.515 + Hand AR: 0.603 + Whole AP: 0.586 + Whole AR: 0.673 + Task: Wholebody 2D Keypoint + Weights: 
https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_384x288-78cacac3_20200922.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.701 + Body AR: 0.776 + Face AP: 0.656 + Face AR: 0.743 + Foot AP: 0.675 + Foot AR: 0.787 + Hand AP: 0.535 + Hand AR: 0.639 + Whole AP: 0.579 + Whole AR: 0.681 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_256x192-643e18cb_20200922.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py + In Collection: HRNet + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.722 + Body AR: 0.791 + Face AP: 0.776 + Face AR: 0.834 + Foot AP: 0.696 + Foot AR: 0.801 + Hand AP: 0.587 + Hand AR: 0.678 + Whole AP: 0.632 + Whole AR: 0.717 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288-6e061c6a_20200922.pth diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.md new file mode 100644 index 0000000..fa4bc27 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.md @@ -0,0 +1,58 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [pose_hrnet_w32_dark](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.693 | 0.764 | 0.564 | 0.674 | 0.737 | 0.809 | 0.503 | 0.602 | 0.582 | 0.671 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192_dark-469327ef_20200922.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192_dark_20200922.log.json) | +| [pose_hrnet_w48_dark+](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py) | 384x288 | 0.742 | 0.807 | 0.707 | 0.806 | 0.841 | 0.892 | 0.602 | 0.694 | 0.661 | 0.743 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288_dark-f5726563_20200918.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288_dark_20200918.log.json) | + +Note: `+` means the model is first pre-trained on original COCO dataset, and then fine-tuned on COCO-WholeBody dataset. We find this will lead to better performance. 
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.yml new file mode 100644 index 0000000..25a22cc --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.yml @@ -0,0 +1,45 @@ +Models: +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py + In Collection: DarkPose + Metadata: + Architecture: &id001 + - HRNet + - DarkPose + Training Data: COCO-WholeBody + Name: td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.693 + Body AR: 0.764 + Face AP: 0.737 + Face AR: 0.809 + Foot AP: 0.564 + Foot AR: 0.674 + Hand AP: 0.503 + Hand AR: 0.602 + Whole AP: 0.582 + Whole AR: 0.671 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192_dark-469327ef_20200922.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py + In Collection: DarkPose + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.742 + Body AR: 0.807 + Face AP: 0.841 + Face AR: 0.892 + Foot AP: 0.707 + Foot AR: 0.806 + Hand AP: 0.602 + Hand AR: 0.694 + Whole AP: 0.661 + Whole AR: 0.743 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288_dark-f5726563_20200918.pth diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.md new 
file mode 100644 index 0000000..187e5d3 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.md @@ -0,0 +1,43 @@ + + +
+SimpleBaseline2D (ECCV'2018) + +```bibtex +@inproceedings{xiao2018simple, + title={Simple baselines for human pose estimation and tracking}, + author={Xiao, Bin and Wu, Haiping and Wei, Yichen}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={466--481}, + year={2018} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [pose_resnet_50](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.652 | 0.738 | 0.615 | 0.749 | 0.606 | 0.715 | 0.460 | 0.584 | 0.521 | 0.633 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_256x192-9e37ed88_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_256x192_20201004.log.json) | +| [pose_resnet_50](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py) | 384x288 | 0.666 | 0.747 | 0.634 | 0.763 | 0.731 | 0.811 | 0.536 | 0.646 | 0.574 | 0.670 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_384x288-ce11e294_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_384x288_20201004.log.json) | +| [pose_resnet_101](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py) | 256x192 | 0.669 | 0.753 | 0.637 | 0.766 | 0.611 | 0.722 | 0.463 | 0.589 | 0.531 | 0.645 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_256x192-7325f982_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_256x192_20201004.log.json) | +| [pose_resnet_101](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py) | 384x288 | 0.692 | 0.770 | 
0.680 | 0.799 | 0.746 | 0.820 | 0.548 | 0.657 | 0.597 | 0.693 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_384x288-6c137b9a_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_384x288_20201004.log.json) | +| [pose_resnet_152](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py) | 256x192 | 0.682 | 0.764 | 0.661 | 0.787 | 0.623 | 0.728 | 0.481 | 0.607 | 0.548 | 0.661 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_256x192-5de8ae23_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_256x192_20201004.log.json) | +| [pose_resnet_152](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py) | 384x288 | 0.704 | 0.780 | 0.693 | 0.813 | 0.751 | 0.824 | 0.559 | 0.666 | 0.610 | 0.705 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_384x288-eab8caa8_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_384x288_20201004.log.json) | diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.yml new file mode 100644 index 0000000..c4c148a --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.yml @@ -0,0 +1,128 @@ +Models: +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: &id001 + - SimpleBaseline2D + Training Data: COCO-WholeBody + Name: td-hm_res50_8xb64-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.652 + Body AR: 0.738 + Face 
AP: 0.606 + Face AR: 0.715 + Foot AP: 0.615 + Foot AR: 0.749 + Hand AP: 0.46 + Hand AR: 0.584 + Whole AP: 0.521 + Whole AR: 0.633 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_256x192-9e37ed88_20201004.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_res50_8xb64-210e_coco-wholebody-384x288 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.666 + Body AR: 0.747 + Face AP: 0.731 + Face AR: 0.811 + Foot AP: 0.634 + Foot AR: 0.763 + Hand AP: 0.536 + Hand AR: 0.646 + Whole AP: 0.574 + Whole AR: 0.67 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_384x288-ce11e294_20201004.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_res101_8xb32-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.669 + Body AR: 0.753 + Face AP: 0.611 + Face AR: 0.722 + Foot AP: 0.637 + Foot AR: 0.766 + Hand AP: 0.463 + Hand AR: 0.589 + Whole AP: 0.531 + Whole AR: 0.645 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_256x192-7325f982_20201004.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_res101_8xb32-210e_coco-wholebody-384x288 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.692 + Body AR: 0.77 + Face AP: 0.746 + Face AR: 0.82 + Foot AP: 0.68 + Foot AR: 0.799 + Hand 
AP: 0.548 + Hand AR: 0.657 + Whole AP: 0.598 + Whole AR: 0.691 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_384x288-6c137b9a_20201004.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_res152_8xb32-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.682 + Body AR: 0.764 + Face AP: 0.623 + Face AR: 0.728 + Foot AP: 0.661 + Foot AR: 0.787 + Hand AP: 0.481 + Hand AR: 0.607 + Whole AP: 0.548 + Whole AR: 0.661 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_256x192-5de8ae23_20201004.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_res152_8xb32-210e_coco-wholebody-384x288 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.704 + Body AR: 0.78 + Face AP: 0.751 + Face AR: 0.824 + Foot AP: 0.693 + Foot AR: 0.813 + Hand AP: 0.559 + Hand AR: 0.666 + Whole AP: 0.61 + Whole AR: 0.705 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_384x288-eab8caa8_20201004.pth diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..339581e --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,150 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=133, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 
'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py new file mode 100644 index 0000000..677fdc7 --- /dev/null +++ 
b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=133, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + 
test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py 
b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..3717831 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 
'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=133, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = 
val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..5c53b7c --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + 
num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=133, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator 
= dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py new file mode 100644 index 0000000..ef25d2a --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py @@ -0,0 +1,150 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + 
num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=133, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 
'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py new file mode 100644 index 0000000..d77872c --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(288, 384), + heatmap_size=(72, 96), + sigma=3, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + 
block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=48, + out_channels=133, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..87c273c --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + 
), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py new file mode 100644 index 0000000..5e58a16 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + 
dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..3ce4936 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py @@ -0,0 +1,121 @@ 
+_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data 
loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py new file mode 100644 index 0000000..a92c4d2 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on 
the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=152, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + 
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..127c322 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 
57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + 
ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py new file mode 100644 index 0000000..88a88e2 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py @@ -0,0 +1,121 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' 
+data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..b39adf9 --- /dev/null +++ 
b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,122 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ViPNAS_MobileNetV3'), + head=dict( + type='ViPNASHead', + in_channels=160, + out_channels=133, + deconv_out_channels=(160, 160, 160), + deconv_num_groups=(160, 160, 160), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + 
dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..851c04a --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,126 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = 
[ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict(type='ViPNAS_MobileNetV3'), + head=dict( + type='ViPNASHead', + in_channels=160, + out_channels=133, + deconv_out_channels=(160, 160, 160), + deconv_num_groups=(160, 160, 160), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + 
type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..24c7578 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,123 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + 
checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ViPNAS_ResNet', + depth=50, + ), + head=dict( + type='ViPNASHead', + in_channels=608, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, 
+ ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000..585e3dc --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,127 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2, + unbiased=True) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ViPNAS_ResNet', + depth=50, + ), + 
head=dict( + type='ViPNASHead', + in_channels=608, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + rotate_factor=60, + scale_factor=(0.75, 1.25)), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git 
a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.md new file mode 100644 index 0000000..13b0321 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.md @@ -0,0 +1,38 @@ + + +
+ViPNAS (CVPR'2021) + +```bibtex +@article{xu2021vipnas, + title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search}, + author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + year={2021} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [S-ViPNAS-MobileNetV3](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.619 | 0.700 | 0.477 | 0.608 | 0.585 | 0.689 | 0.386 | 0.505 | 0.473 | 0.578 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192-0fee581a_20211205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192_20211205.log.json) | +| [S-ViPNAS-Res50](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.643 | 0.726 | 0.553 | 0.694 | 0.587 | 0.698 | 0.410 | 0.529 | 0.495 | 0.607 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192-49e1c3a4_20211112.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192_20211112.log.json) | diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.yml new file mode 100644 index 0000000..cae2a9a --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.yml @@ -0,0 +1,44 @@ +Models: +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py + In Collection: ViPNAS + Metadata: + 
Architecture: &id001 + - ViPNAS + Training Data: COCO-WholeBody + Name: td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.619 + Body AR: 0.7 + Face AP: 0.585 + Face AR: 0.689 + Foot AP: 0.477 + Foot AR: 0.608 + Hand AP: 0.386 + Hand AR: 0.505 + Whole AP: 0.473 + Whole AR: 0.578 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192-0fee581a_20211205.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py + In Collection: ViPNAS + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.643 + Body AR: 0.726 + Face AP: 0.587 + Face AR: 0.698 + Foot AP: 0.553 + Foot AR: 0.694 + Hand AP: 0.41 + Hand AR: 0.529 + Whole AP: 0.495 + Whole AR: 0.607 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192-49e1c3a4_20211112.pth diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.md new file mode 100644 index 0000000..6bc5624 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.md @@ -0,0 +1,55 @@ + + +
+ViPNAS (CVPR'2021) + +```bibtex +@article{xu2021vipnas, + title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search}, + author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + year={2021} +} +``` + +
+ + + +
+DarkPose (CVPR'2020) + +```bibtex +@inproceedings{zhang2020distribution, + title={Distribution-aware coordinate representation for human pose estimation}, + author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={7093--7102}, + year={2020} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [S-ViPNAS-MobileNetV3_dark](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.632 | 0.710 | 0.530 | 0.660 | 0.672 | 0.771 | 0.404 | 0.519 | 0.508 | 0.607 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192_dark-e2158108_20211205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192_dark_20211205.log.json) | +| [S-ViPNAS-Res50_dark](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.650 | 0.732 | 0.550 | 0.686 | 0.684 | 0.783 | 0.437 | 0.554 | 0.528 | 0.632 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192_dark-67c0ce35_20211112.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192_dark_20211112.log.json) | diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.yml new file mode 100644 index 0000000..0f10316 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.yml @@ -0,0 +1,45 @@ +Models: +- Config: 
configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py + In Collection: ViPNAS + Metadata: + Architecture: &id001 + - ViPNAS + - DarkPose + Training Data: COCO-WholeBody + Name: td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.632 + Body AR: 0.71 + Face AP: 0.672 + Face AR: 0.771 + Foot AP: 0.53 + Foot AR: 0.66 + Hand AP: 0.404 + Hand AR: 0.519 + Whole AP: 0.508 + Whole AR: 0.607 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192_dark-e2158108_20211205.pth +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py + In Collection: ViPNAS + Metadata: + Architecture: *id001 + Training Data: COCO-WholeBody + Name: td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192 + Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.65 + Body AR: 0.732 + Face AP: 0.684 + Face AR: 0.783 + Foot AP: 0.55 + Foot AR: 0.686 + Hand AP: 0.437 + Hand AR: 0.554 + Whole AP: 0.528 + Whole AR: 0.632 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192_dark-67c0ce35_20211112.pth diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_coco-wholebody.yml new file mode 100644 index 0000000..736e0bd --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_coco-wholebody.yml @@ -0,0 +1,23 @@ +Models: +- Config: configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py + In Collection: HRNet + Metadata: + Architecture: &id001 + - HRNet + Training Data: UBody-COCO-WholeBody + Name: td-hm_hrnet-w32_8xb64-210e_ubody-256x192 + 
Results: + - Dataset: COCO-WholeBody + Metrics: + Body AP: 0.678 + Body AR: 0.755 + Face AP: 0.630 + Face AR: 0.708 + Foot AP: 0.543 + Foot AR: 0.661 + Hand AP: 0.467 + Hand AR: 0.566 + Whole AP: 0.536 + Whole AR: 0.636 + Task: Wholebody 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/wholebody_2d_keypoint/ubody/td-hm_hrnet-w32_8xb64-210e_ubody-coco-256x192-7c227391_20230807.pth diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_ubody-coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_ubody-coco-wholebody.md new file mode 100644 index 0000000..cb4a4cf --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_ubody-coco-wholebody.md @@ -0,0 +1,38 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+UBody (CVPR'2023) + +```bibtex +@article{lin2023one, + title={One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer}, + author={Lin, Jing and Zeng, Ailing and Wang, Haoqian and Zhang, Lei and Li, Yu}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + year={2023}, +} +``` + +
+ +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [pose_hrnet_w32](/configs/wholebody_2d_keypoint/topdown_heatmap/ubody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.685 | 0.759 | 0.564 | 0.675 | 0.625 | 0.705 | 0.516 | 0.609 | 0.549 | 0.646 | [ckpt](https://download.openmmlab.com/mmpose/v1/wholebody_2d_keypoint/ubody/td-hm_hrnet-w32_8xb64-210e_ubody-coco-256x192-7c227391_20230807.pth) | [log](https://download.openmmlab.com/mmpose/v1/wholebody_2d_keypoint/ubody/td-hm_hrnet-w32_8xb64-210e_ubody-coco-256x192-7c227391_20230807.json) | diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py new file mode 100644 index 0000000..38aa1a9 --- /dev/null +++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py @@ -0,0 +1,173 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# 
hooks +default_hooks = dict( + checkpoint=dict(save_best='coco-wholebody/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=133, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'UBody2dDataset' +data_mode = 'topdown' +data_root = 'data/UBody/' + +scenes = [ + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] + +train_datasets = [ + dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[]) +] + +for scene in scenes: + train_dataset = dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file=f'annotations/{scene}/train_annotations.json', + data_prefix=dict(img='images/'), + pipeline=[], + sample_interval=10) + train_datasets.append(train_dataset) + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=train_datasets, + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='data/coco/val2017/'), + pipeline=val_pipeline, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_1class.py b/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_1class.py new file mode 100644 index 0000000..4e4b8e3 --- /dev/null +++ 
b/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_1class.py @@ -0,0 +1,270 @@ +# runtime settings +default_scope = 'mmdet' + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='DetVisualizationHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) + +log_level = 'INFO' +load_from = None +resume = False + +# model settings +model = dict( + type='CascadeRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + 
type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', 
iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + 
batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_coco.py b/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_coco.py new file mode 100644 index 0000000..5b9d43a --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_coco.py @@ -0,0 +1,256 @@ +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] + +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[16, 19]) +total_epochs = 20 + +# model settings +model = dict( + type='CascadeRCNN', + pretrained='open-mmlab://resnext101_64x4d', + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + 
norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + 
reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) + +dataset_type = 'CocoDataset' +data_root = 'data/coco' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], 
to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_train2017.json', + img_prefix=f'{data_root}/train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='bbox') diff --git a/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_1class.py b/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_1class.py new file mode 100644 index 0000000..e1fc35d --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_1class.py @@ -0,0 +1,182 @@ +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +# optimizer +optimizer = dict(type='SGD', 
lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 11]) +total_epochs = 12 + +model = dict( + type='FasterRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + 
add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) + +dataset_type = 'CocoDataset' +data_root = 'data/coco' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_train2017.json', + img_prefix=f'{data_root}/train2017/', + pipeline=train_pipeline), 
+ val=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='bbox') diff --git a/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py b/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py new file mode 100644 index 0000000..74d5498 --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py @@ -0,0 +1,196 @@ +# runtime settings +default_scope = 'mmdet' + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='DetVisualizationHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) + +log_level = 'INFO' +load_from = None +resume = False + +# model settings +model = dict( + type='FasterRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + 
num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', 
iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator diff --git a/modules/rtmpose/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py 
b/modules/rtmpose/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py new file mode 100644 index 0000000..226ae25 --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py @@ -0,0 +1,242 @@ +model = dict( + type='MaskRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + 
loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Normalize', + 
mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='CocoDataset', + ann_file='data/coco/annotations/instances_train2017.json', + img_prefix='data/coco/train2017/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) + ]), + val=dict( + type='CocoDataset', + ann_file='data/coco/annotations/instances_val2017.json', + img_prefix='data/coco/val2017/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='CocoDataset', + ann_file='data/coco/annotations/instances_val2017.json', + img_prefix='data/coco/val2017/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', 
keys=['img']) + ]) + ])) +evaluation = dict(metric=['bbox', 'segm']) +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[16, 22]) +runner = dict(type='EpochBasedRunner', max_epochs=24) +checkpoint_config = dict(interval=1) +log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')]) +custom_hooks = [dict(type='NumClassCheckHook')] +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py b/modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py new file mode 100644 index 0000000..87bc3c8 --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py @@ -0,0 +1,20 @@ +_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + bbox_head=dict(num_classes=1), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +train_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) + +val_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) +test_dataloader = val_dataloader diff --git a/modules/rtmpose/mmdetection_cfg/rtmdet_m_8xb32-300e_coco.py b/modules/rtmpose/mmdetection_cfg/rtmdet_m_8xb32-300e_coco.py new file mode 100644 index 0000000..7cca574 --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/rtmdet_m_8xb32-300e_coco.py @@ -0,0 +1 @@ +_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py' diff --git 
a/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_coco-person.py b/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_coco-person.py new file mode 100644 index 0000000..a681da0 --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_coco-person.py @@ -0,0 +1,104 @@ +_base_ = 'mmdet::rtmdet/rtmdet_l_8xb32-300e_coco.py' + +input_shape = 320 + +model = dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.25, + use_depthwise=True, + ), + neck=dict( + in_channels=[64, 128, 256], + out_channels=64, + num_csp_blocks=1, + use_depthwise=True, + ), + bbox_head=dict( + in_channels=64, + feat_channels=64, + share_conv=False, + exp_on_reg=False, + use_depthwise=True, + num_classes=1), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='CachedMosaic', + img_scale=(input_shape, input_shape), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + type='RandomResize', + scale=(input_shape * 2, input_shape * 2), + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(input_shape, input_shape)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(input_shape, input_shape), + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(input_shape, input_shape)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +test_pipeline = [ + 
dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(input_shape, input_shape), keep_ratio=True), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + dataset=dict(pipeline=train_pipeline, metainfo=dict(classes=('person', )))) + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=('person', )))) +test_dataloader = val_dataloader + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_hand.py b/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_hand.py new file mode 100644 index 0000000..abec3c0 --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_hand.py @@ -0,0 +1,171 @@ +_base_ = 'mmdet::rtmdet/rtmdet_l_8xb32-300e_coco.py' + +input_shape = 320 + +model = dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.25, + use_depthwise=True, + ), + neck=dict( + in_channels=[64, 128, 256], + out_channels=64, + num_csp_blocks=1, + use_depthwise=True, + ), + bbox_head=dict( + in_channels=64, + feat_channels=64, + share_conv=False, + exp_on_reg=False, + use_depthwise=True, + num_classes=1), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({'data/': 's3://openmmlab/datasets/'})) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='CachedMosaic', + img_scale=(input_shape, input_shape), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + 
type='RandomResize', + scale=(input_shape * 2, input_shape * 2), + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(input_shape, input_shape)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(input_shape, input_shape), + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(input_shape, input_shape)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(input_shape, input_shape), keep_ratio=True), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +data_mode = 'topdown' +data_root = 'data/' + +train_dataset = dict( + _delete_=True, + type='ConcatDataset', + datasets=[ + dict( + type='mmpose.OneHand10KDataset', + data_root=data_root, + data_mode=data_mode, + pipeline=train_pipeline, + ann_file='onehand10k/annotations/onehand10k_train.json', + data_prefix=dict(img='pose/OneHand10K/')), + dict( + type='mmpose.FreiHandDataset', + data_root=data_root, + data_mode=data_mode, + pipeline=train_pipeline, + ann_file='freihand/annotations/freihand_train.json', + data_prefix=dict(img='pose/FreiHand/')), + dict( + type='mmpose.Rhd2DDataset', + data_root=data_root, + data_mode=data_mode, + pipeline=train_pipeline, + ann_file='rhd/annotations/rhd_train.json', + data_prefix=dict(img='pose/RHD/')), + dict( + type='mmpose.HalpeHandDataset', + 
data_root=data_root, + data_mode=data_mode, + pipeline=train_pipeline, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict( + img='pose/Halpe/hico_20160224_det/images/train2015/') # noqa + ) + ], + ignore_keys=[ + 'CLASSES', 'dataset_keypoint_weights', 'dataset_name', 'flip_indices', + 'flip_pairs', 'keypoint_colors', 'keypoint_id2name', + 'keypoint_name2id', 'lower_body_ids', 'num_keypoints', + 'num_skeleton_links', 'sigmas', 'skeleton_link_colors', + 'skeleton_links', 'upper_body_ids' + ], +) + +test_dataset = dict( + _delete_=True, + type='mmpose.OneHand10KDataset', + data_root=data_root, + data_mode=data_mode, + pipeline=test_pipeline, + ann_file='onehand10k/annotations/onehand10k_test.json', + data_prefix=dict(img='pose/OneHand10K/'), +) + +train_dataloader = dict(dataset=train_dataset) +val_dataloader = dict(dataset=test_dataset) +test_dataloader = val_dataloader + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'onehand10k/annotations/onehand10k_test.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator + +train_cfg = dict(val_interval=1) diff --git a/modules/rtmpose/mmdetection_cfg/rtmdet_tiny_8xb32-300e_coco.py b/modules/rtmpose/mmdetection_cfg/rtmdet_tiny_8xb32-300e_coco.py new file mode 100644 index 0000000..d6a5a64 --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/rtmdet_tiny_8xb32-300e_coco.py @@ -0,0 +1 @@ +_base_ = 'mmdet::rtmdet/rtmdet_tiny_8xb32-300e_coco.py' diff --git a/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py b/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py new file mode 100644 index 0000000..ee463f0 --- /dev/null +++ 
b/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py @@ -0,0 +1,136 @@ +# model settings +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1) +model = dict( + type='SingleStageDetector', + data_preprocessor=data_preprocessor, + backbone=dict( + type='MobileNetV2', + out_indices=(4, 7), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)), + neck=dict( + type='SSDNeck', + in_channels=(96, 1280), + out_channels=(96, 1280, 512, 256, 256, 128), + level_strides=(2, 2, 2, 2), + level_paddings=(1, 1, 1, 1), + l2_norm_scale=None, + use_depthwise=True, + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + act_cfg=dict(type='ReLU6'), + init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)), + bbox_head=dict( + type='SSDHead', + in_channels=(96, 1280, 512, 256, 256, 128), + num_classes=80, + use_depthwise=True, + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + act_cfg=dict(type='ReLU6'), + init_cfg=dict(type='Normal', layer='Conv2d', std=0.001), + + # set anchor size manually instead of using the predefined + # SSD300 setting. 
+ anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + strides=[16, 32, 64, 107, 160, 320], + ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]], + min_sizes=[48, 100, 150, 202, 253, 304], + max_sizes=[100, 150, 202, 253, 304, 320]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2])), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0., + ignore_iof_thr=-1, + gt_max_assign_all=False), + sampler=dict(type='PseudoSampler'), + smoothl1_beta=1., + allowed_border=-1, + pos_weight=-1, + neg_pos_ratio=3, + debug=False), + test_cfg=dict( + nms_pre=1000, + nms=dict(type='nms', iou_threshold=0.45), + min_bbox_size=0, + score_thr=0.02, + max_per_img=200)) +env_cfg = dict(cudnn_benchmark=True) + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +input_size = 320 +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=data_preprocessor['mean'], + to_rgb=data_preprocessor['bgr_to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=24, + num_workers=4, + batch_sampler=None, + dataset=dict( + _delete_=True, + 
type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=8, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2_scratch_600e_onehand.py b/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2_scratch_600e_onehand.py new file mode 100644 index 0000000..1448916 --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2_scratch_600e_onehand.py @@ -0,0 +1,153 @@ +# ========================================================= +# from 'mmdetection/configs/_base_/default_runtime.py' +# ========================================================= +default_scope = 'mmdet' +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +custom_hooks = [dict(type='NumClassCheckHook')] +# ========================================================= + +# model settings +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1) +model = dict( + type='SingleStageDetector', + data_preprocessor=data_preprocessor, + backbone=dict( + type='MobileNetV2', + out_indices=(4, 7), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)), + neck=dict( + type='SSDNeck', + in_channels=(96, 1280), + out_channels=(96, 
1280, 512, 256, 256, 128), + level_strides=(2, 2, 2, 2), + level_paddings=(1, 1, 1, 1), + l2_norm_scale=None, + use_depthwise=True, + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + act_cfg=dict(type='ReLU6'), + init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)), + bbox_head=dict( + type='SSDHead', + in_channels=(96, 1280, 512, 256, 256, 128), + num_classes=1, + use_depthwise=True, + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + act_cfg=dict(type='ReLU6'), + init_cfg=dict(type='Normal', layer='Conv2d', std=0.001), + + # set anchor size manually instead of using the predefined + # SSD300 setting. + anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + strides=[16, 32, 64, 107, 160, 320], + ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]], + min_sizes=[48, 100, 150, 202, 253, 304], + max_sizes=[100, 150, 202, 253, 304, 320]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2])), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0., + ignore_iof_thr=-1, + gt_max_assign_all=False), + sampler=dict(type='PseudoSampler'), + smoothl1_beta=1., + allowed_border=-1, + pos_weight=-1, + neg_pos_ratio=3, + debug=False), + test_cfg=dict( + nms_pre=1000, + nms=dict(type='nms', iou_threshold=0.45), + min_bbox_size=0, + score_thr=0.02, + max_per_img=200)) +cudnn_benchmark = True + +# dataset settings +file_client_args = dict(backend='disk') + +dataset_type = 'CocoDataset' +data_root = 'data/onehand10k/' +classes = ('hand', ) +input_size = 320 +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + batch_size=8, 
+ num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/onehand10k_test.json', + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +optimizer = dict(type='SGD', lr=0.015, momentum=0.9, weight_decay=4.0e-5) +optimizer_config = dict(grad_clip=None) + +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + min_lr=0) +runner = dict(type='EpochBasedRunner', max_epochs=120) + +# Avoid evaluation and saving weights too frequently +evaluation = dict(interval=5, metric='bbox') +checkpoint_config = dict(interval=5) +custom_hooks = [ + dict(type='NumClassCheckHook'), + dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW') +] + +log_config = dict(interval=5) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (24 samples per GPU) +auto_scale_lr = dict(base_batch_size=192) + +load_from = 'https://download.openmmlab.com/mmdetection/' \ +'v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/' \ +'ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth' + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') diff --git a/modules/rtmpose/mmdetection_cfg/yolov3_d53_320_273e_coco.py b/modules/rtmpose/mmdetection_cfg/yolov3_d53_320_273e_coco.py new file mode 100644 index 0000000..2d3efab --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/yolov3_d53_320_273e_coco.py @@ -0,0 +1,140 @@ +# model settings +model = dict( + type='YOLOV3', + pretrained='open-mmlab://darknet53', + backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)), + neck=dict( + type='YOLOV3Neck', + num_scales=3, + in_channels=[1024, 512, 256], + out_channels=[512, 256, 128]), + bbox_head=dict( + type='YOLOV3Head', + num_classes=80, + in_channels=[512, 256, 128], + out_channels=[1024, 512, 256], + anchor_generator=dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_conf=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_xy=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=2.0, + reduction='sum'), + loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='GridAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0)), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', 
iou_threshold=0.45), + max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco' +img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='PhotoMetricDistortion'), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(320, 320), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(320, 320), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_train2017.json', + img_prefix=f'{data_root}/train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=f'{data_root}/annotations/instances_val2017.json', + img_prefix=f'{data_root}/val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + 
warmup_iters=2000, # same as burn-in in darknet + warmup_ratio=0.1, + step=[218, 246]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=273) +evaluation = dict(interval=1, metric=['bbox']) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +custom_hooks = [dict(type='NumClassCheckHook')] + +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/modules/rtmpose/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py b/modules/rtmpose/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py new file mode 100644 index 0000000..225d2c6 --- /dev/null +++ b/modules/rtmpose/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py @@ -0,0 +1,300 @@ +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300, val_interval=10) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +param_scheduler = [ + dict( + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0005, + begin=5, + T_max=285, + end=285, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=285, end=300) +] +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0)) +auto_scale_lr = dict(enable=False, base_batch_size=64) +default_scope = 'mmdet' +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='DetVisualizationHook')) +env_cfg = dict( + 
cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend')], + name='visualizer') +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) +log_level = 'INFO' +load_from = 'https://download.openmmlab.com/mmdetection/' \ + 'v2.0/yolox/yolox_s_8x8_300e_coco/' \ + 'yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth' +resume = False +img_scale = (640, 640) +model = dict( + type='YOLOX', + data_preprocessor=dict( + type='DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=10) + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(2, 3, 4), + use_depthwise=False, + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + neck=dict( + type='YOLOXPAFPN', + in_channels=[128, 256, 512], + out_channels=128, + num_csp_blocks=1, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + bbox_head=dict( + type='YOLOXHead', + num_classes=1, + in_channels=128, + feat_channels=128, + stacked_convs=2, + strides=(8, 16, 32), + use_depthwise=False, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)), + train_cfg=dict(assigner=dict(type='SimOTAAssigner', 
center_radius=2.5)), + test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65))) +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +backend_args = dict(backend='local') +train_pipeline = [ + dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomAffine', scaling_ratio_range=(0.1, 2), + border=(-320, -320)), + dict( + type='MixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] +train_dataset = dict( + type='MultiImageMixDataset', + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict(type='LoadImageFromFile', backend_args=dict(backend='local')), + dict(type='LoadAnnotations', with_bbox=True) + ], + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + pipeline=[ + dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-320, -320)), + dict( + type='MixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') + ]) +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=dict(backend='local')), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='MultiImageMixDataset', + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/coco_face_train.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=dict(backend='local')), + dict(type='LoadAnnotations', with_bbox=True) + ], + filter_cfg=dict(filter_empty_gt=False, min_size=32), + metainfo=dict(CLASSES=('person', ), PALETTE=(220, 20, 60))), + pipeline=[ + dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-320, -320)), + dict( + type='MixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict(type='PackDetInputs') + ])) +val_dataloader = dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/coco_face_val.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict(type='LoadImageFromFile', backend_args=dict(backend='local')), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 
'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) + ], + metainfo=dict(CLASSES=('person', ), PALETTE=(220, 20, 60)))) +test_dataloader = dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/coco_face_val.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict(type='LoadImageFromFile', backend_args=dict(backend='local')), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) + ], + metainfo=dict(CLASSES=('person', ), PALETTE=(220, 20, 60)))) +val_evaluator = dict( + type='CocoMetric', + ann_file='data/coco/annotations/coco_face_val.json', + metric='bbox') +test_evaluator = dict( + type='CocoMetric', + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') +max_epochs = 300 +num_last_epochs = 15 +interval = 10 +base_lr = 0.01 +custom_hooks = [ + dict(type='YOLOXModeSwitchHook', num_last_epochs=15, priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + strict_load=False, + update_buffers=True, + priority=49) +] +metainfo = dict(CLASSES=('person', ), PALETTE=(220, 20, 60)) +launcher = 'pytorch' diff --git a/modules/rtmpose/mmtracking_cfg/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py b/modules/rtmpose/mmtracking_cfg/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py new file mode 100644 index 0000000..3dd5129 --- /dev/null +++ b/modules/rtmpose/mmtracking_cfg/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py @@ -0,0 +1,321 @@ +model = dict( + detector=dict( + type='FasterRCNN', + backbone=dict( + type='ResNet', + depth=50, 
+ num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0], + clip_border=False), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=0.1111111111111111, + loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2], + clip_border=False), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + 
type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmtracking/' + 'mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth')), + type='DeepSORT', + motion=dict(type='KalmanFilter', center_only=False), + reid=dict( + type='BaseReID', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + loss_pairwise=dict( + type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmtracking/' + 'mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth')), + tracker=dict( + type='SortTracker', + obj_score_thr=0.5, + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr=0.5, + momentums=None, + num_tentatives=2, + num_frames_retain=100)) +dataset_type = 'MOTChallengeDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadMultiImagesFromFile', to_float32=True), + dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), + dict( + type='SeqResize', + img_scale=(1088, 
1088), + share_params=True, + ratio_range=(0.8, 1.2), + keep_ratio=True, + bbox_clip_border=False), + dict(type='SeqPhotoMetricDistortion', share_params=True), + dict( + type='SeqRandomCrop', + share_params=False, + crop_size=(1088, 1088), + bbox_clip_border=False), + dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), + dict( + type='SeqNormalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='SeqPad', size_divisor=32), + dict(type='MatchInstances', skip_nomatch=True), + dict( + type='VideoCollect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', + 'gt_instance_ids' + ]), + dict(type='SeqDefaultFormatBundle', ref_prefix='ref') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1088, 1088), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='VideoCollect', keys=['img']) + ]) +] +data_root = 'data/MOT17/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='MOTChallengeDataset', + visibility_thr=-1, + ann_file='data/MOT17/annotations/half-train_cocoformat.json', + img_prefix='data/MOT17/train', + ref_img_sampler=dict( + num_ref_imgs=1, + frame_range=10, + filter_key_img=True, + method='uniform'), + pipeline=[ + dict(type='LoadMultiImagesFromFile', to_float32=True), + dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), + dict( + type='SeqResize', + img_scale=(1088, 1088), + share_params=True, + ratio_range=(0.8, 1.2), + keep_ratio=True, + bbox_clip_border=False), + dict(type='SeqPhotoMetricDistortion', share_params=True), + dict( + type='SeqRandomCrop', + share_params=False, + crop_size=(1088, 1088), + bbox_clip_border=False), + dict(type='SeqRandomFlip', 
share_params=True, flip_ratio=0.5), + dict( + type='SeqNormalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='SeqPad', size_divisor=32), + dict(type='MatchInstances', skip_nomatch=True), + dict( + type='VideoCollect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', + 'gt_instance_ids' + ]), + dict(type='SeqDefaultFormatBundle', ref_prefix='ref') + ]), + val=dict( + type='MOTChallengeDataset', + ann_file='data/MOT17/annotations/half-val_cocoformat.json', + img_prefix='data/MOT17/train', + ref_img_sampler=None, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1088, 1088), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='VideoCollect', keys=['img']) + ]) + ]), + test=dict( + type='MOTChallengeDataset', + ann_file='data/MOT17/annotations/half-val_cocoformat.json', + img_prefix='data/MOT17/train', + ref_img_sampler=None, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1088, 1088), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='VideoCollect', keys=['img']) + ]) + ])) +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +checkpoint_config = dict(interval=1) +log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +lr_config = 
dict( + policy='step', + warmup='linear', + warmup_iters=100, + warmup_ratio=0.01, + step=[3]) +total_epochs = 4 +evaluation = dict(metric=['bbox', 'track'], interval=1) +search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML'] diff --git a/modules/rtmpose/mmtracking_cfg/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py b/modules/rtmpose/mmtracking_cfg/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py new file mode 100644 index 0000000..db94427 --- /dev/null +++ b/modules/rtmpose/mmtracking_cfg/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py @@ -0,0 +1,325 @@ +model = dict( + detector=dict( + type='FasterRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0], + clip_border=False), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=0.1111111111111111, + loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2], + clip_border=False), + reg_class_agnostic=False, + 
loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))), + type='Tracktor', + pretrains=dict( + detector='https://download.openmmlab.com/mmtracking/' + 'mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth', + reid='https://download.openmmlab.com/mmtracking/mot/' + 'reid/reid_r50_6e_mot17-4bf6b63d.pth'), + reid=dict( + type='BaseReID', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=378, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + loss_pairwise=dict( + type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'))), + motion=dict( + type='CameraMotionCompensation', + 
warp_mode='cv2.MOTION_EUCLIDEAN', + num_iters=100, + stop_eps=1e-05), + tracker=dict( + type='TracktorTracker', + obj_score_thr=0.5, + regression=dict( + obj_score_thr=0.5, + nms=dict(type='nms', iou_threshold=0.6), + match_iou_thr=0.3), + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0, + match_iou_thr=0.2), + momentums=None, + num_frames_retain=10)) +dataset_type = 'MOTChallengeDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadMultiImagesFromFile', to_float32=True), + dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), + dict( + type='SeqResize', + img_scale=(1088, 1088), + share_params=True, + ratio_range=(0.8, 1.2), + keep_ratio=True, + bbox_clip_border=False), + dict(type='SeqPhotoMetricDistortion', share_params=True), + dict( + type='SeqRandomCrop', + share_params=False, + crop_size=(1088, 1088), + bbox_clip_border=False), + dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), + dict( + type='SeqNormalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='SeqPad', size_divisor=32), + dict(type='MatchInstances', skip_nomatch=True), + dict( + type='VideoCollect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', + 'gt_instance_ids' + ]), + dict(type='SeqDefaultFormatBundle', ref_prefix='ref') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1088, 1088), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='VideoCollect', keys=['img']) + ]) +] +data_root = 'data/MOT17/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + 
type='MOTChallengeDataset', + visibility_thr=-1, + ann_file='data/MOT17/annotations/train_cocoformat.json', + img_prefix='data/MOT17/train', + ref_img_sampler=dict( + num_ref_imgs=1, + frame_range=10, + filter_key_img=True, + method='uniform'), + pipeline=[ + dict(type='LoadMultiImagesFromFile', to_float32=True), + dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), + dict( + type='SeqResize', + img_scale=(1088, 1088), + share_params=True, + ratio_range=(0.8, 1.2), + keep_ratio=True, + bbox_clip_border=False), + dict(type='SeqPhotoMetricDistortion', share_params=True), + dict( + type='SeqRandomCrop', + share_params=False, + crop_size=(1088, 1088), + bbox_clip_border=False), + dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), + dict( + type='SeqNormalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='SeqPad', size_divisor=32), + dict(type='MatchInstances', skip_nomatch=True), + dict( + type='VideoCollect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', + 'gt_instance_ids' + ]), + dict(type='SeqDefaultFormatBundle', ref_prefix='ref') + ]), + val=dict( + type='MOTChallengeDataset', + ann_file='data/MOT17/annotations/train_cocoformat.json', + img_prefix='data/MOT17/train', + ref_img_sampler=None, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1088, 1088), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='VideoCollect', keys=['img']) + ]) + ]), + test=dict( + type='MOTChallengeDataset', + ann_file='data/MOT17/annotations/train_cocoformat.json', + img_prefix='data/MOT17/train', + ref_img_sampler=None, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + 
img_scale=(1088, 1088), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict( + type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='VideoCollect', keys=['img']) + ]) + ])) +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +checkpoint_config = dict(interval=1) +log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=100, + warmup_ratio=0.01, + step=[3]) +total_epochs = 4 +evaluation = dict(metric=['bbox', 'track'], interval=1) +search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML'] +test_set = 'train' diff --git a/requirements.txt b/requirements.txt index 1d9772d..6437e1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ python-dotenv == 1.0.1 opencv-python == 4.10.0.84 -torch == 2.4.0 -torchvision == 0.19.0 requests == 2.32.3 pandas == 2.2.2 joblib == 1.4.2 -lightgbm == 4.5.0 \ No newline at end of file +lightgbm == 4.5.0 +xgboost == 2.1.1 +scipy == 1.9.3 +numpy == 1.24.0 \ No newline at end of file